%load_ext rpy2.ipython
import pandas
# CPU results: Xeon Gold runs from each SYCL implementation/backend, merged
# into one frame. Outer merges on all shared columns keep every row.
cpu = pandas.read_csv('./sycl-bench-gold-computecpp-opencl.csv',comment='#')
cpu = pandas.merge(cpu, pandas.read_csv('./sycl-bench-gold-computecpp-cpu.csv',comment='#'),how='outer')
cpu = pandas.merge(cpu, pandas.read_csv('./sycl-bench-gold-dpc++-cpu.csv',comment='#'),how='outer')
cpu = pandas.merge(cpu, pandas.read_csv('./sycl-bench-gold-hipsycl-cpu.csv',comment='#'),how='outer')
cpu = pandas.merge(cpu, pandas.read_csv('./sycl-bench-gold-trisycl-cpu.csv',comment='#'),how='outer')
# GPU results: Tesla P100 (CUDA) and gfx906 (ROCm) runs.
gpu = pandas.read_csv('./sycl-bench-p100-dpc++-cuda.csv',comment='#')
gpu = pandas.merge(gpu, pandas.read_csv('./sycl-bench-p100-hipsycl-cuda.csv',comment='#'),how='outer')
gpu = pandas.merge(gpu, pandas.read_csv('./sycl-bench-gfx906-hipsycl-rocm.csv',comment='#'),how='outer')
all_res = pandas.merge(gpu,cpu,how='outer')
# Report the number of run-time samples per data point (taken from the first row;
# assumes every row carries the same number of samples -- TODO confirm).
print("using sample size of:",len(list(map(float,cpu['run-time-samples'][0].split()))),"elements per data-point")
Restructure the dataframe to split the run-time-samples column into individual run-time-sample values, duplicating each row once per sample -- this lets R do the heavy lifting of generating the box-and-whisker plots and summary statistics.
from tqdm import tqdm
import os.path
from os import path

# Explode the space-separated 'run-time-samples' column into one row per
# individual sample ('run-time-sample'), so R can compute box-plots and
# summary statistics directly. The exploded frame is cached as a pickle
# because the expansion is slow.
if path.exists("./outdat.pkl"):
    # read previously created pickle file if it exists
    outdat = pandas.read_pickle("./outdat.pkl")
else:
    # Collect the duplicated rows in a list and build the frame once at the
    # end: DataFrame.append() was deprecated (removed in pandas 2.0) and
    # appending inside the loop is quadratic in the number of rows.
    expanded_rows = []
    for _, row in tqdm(all_res.iterrows(), total=all_res.shape[0]):
        samples = row['run-time-samples']
        for y in samples.split(' '):
            # copy() so each duplicated row carries its own sample value
            tmprow = row.copy()
            tmprow['run-time-sample'] = float(y)
            expanded_rows.append(tmprow)
    outdat = pandas.DataFrame(expanded_rows)
    outdat = outdat.drop(columns=['run-time-samples'])
    outdat.to_pickle("./outdat.pkl")
all_res = outdat
all_res
We also add the Runtime variable, which is named according to the sycl-implementation and device-name. This is needed because not all CPU backends can query the device name.
def clear_up_runtime(row):
    """Map a result row's (device-name, sycl-implementation) pair to a
    human-readable Runtime label (implementation + backend + device).

    Needed because not all CPU backends report a distinguishing device
    name. Returns None for unrecognised combinations.
    """
    labels = {
        ("Device 66af", "hipSYCL"): "hipSYCL ROCm - gfx906",  # (gfx906)
        ("Tesla P100-PCIE-12GB", "hipSYCL"): "hipSYCL CUDA - P100",
        ("hipCPU OpenMP host device", "hipSYCL"): "hipSYCL OpenMP - Gold",
        ("Tesla P100-PCIE-12GB", "LLVM CUDA (Codeplay)"): "DPC++ CUDA - P100",
        ("SYCL host device", "LLVM (Intel DPC++)"): "DPC++ pthreads - Gold",
        # todo: generate and check this one:
        ("Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz", "LLVM (Intel DPC++)"): "DPC++ OpenCL - Gold",
        ("Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz", "ComputeCpp"): "ComputeCpp OpenCL - Gold",
        ("Host Device", "ComputeCpp"): "ComputeCpp pthreads - Gold",
        ("unknown", "triSYCL"): "triSYCL OpenMP - Gold",
    }
    return labels.get((row['device-name'], row['sycl-implementation']))
# Label every row with its Runtime (implementation + backend + device).
all_res['Runtime'] = all_res.apply (lambda row: clear_up_runtime(row), axis=1)
Convert these runtimes to factors.
%%R -i all_res -o all_res
# Treat the Runtime labels as a categorical (factor) variable for R.
all_res$Runtime <- as.factor(all_res$Runtime)
Permanently assign colour to each runtime -- to avoid confusion and colour reuse when plots are broken down into types of accelerator.
%%R -i all_res -o colour_scale -o all_res
# While viridis is a great colour palette, we need high contrast between neighbouring elements --like the rainbow palette-- but still need to be colour-blind friendly.
#library('viridisLite')
#colours <- viridisLite::viridis(length(unique(all_res$Runtime)))
#library(RColorBrewer)
#colours <- brewer.pal(length(unique(all_res$Runtime)),'Dark2')
#assign an order to the way SYCL runtimes are presented
all_res$Runtime <- factor(all_res$Runtime, levels = c(
"ComputeCpp pthreads - Gold",
"DPC++ pthreads - Gold",
"hipSYCL OpenMP - Gold",
"triSYCL OpenMP - Gold",
"ComputeCpp OpenCL - Gold",
"DPC++ CUDA - P100",
"hipSYCL CUDA - P100",
"hipSYCL ROCm - gfx906"
))
#all_res$Runtime <- factor(all_res$Runtime, levels = c("ComputeCPP pthreads - Gold", "DPC++ pthreads - Gold", "hipSYCL OpenMP - Gold", "triSYCL OpenMP - Gold", "ComputeCPP OpenCL - Gold", "DPC++ OpenCL - Gold", "DPC++ CUDA - P100","hipSYCL CUDA - P100", "hipSYCL ROCm - gfx906"))
library(scales)
# ggplot2's default hue palette, but with names bound to the factor levels so
# each runtime keeps the same colour in every plot, even when subsetted.
colours <- hue_pal()(length(unique(all_res$Runtime)))
names(colours) <- levels(all_res$Runtime)
colour_scale <- scale_colour_manual(name = "Runtime",values = colours)
# First look: every benchmark in one (overcrowded) box-plot.
indat = all_res
%%R -i indat -i colour_scale -w 10 -h 10 --units in -r 200
library('ggplot2')
library('cowplot')
# R-safe column names: 'Benchmark name' -> 'Benchmark.name', etc.
names(indat) <- make.names(names(indat), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(indat, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + geom_boxplot() + colour_scale
p1
This is obviously too crowded to make any sense of, so let's divide according to SYCL execution construct.
#list all available benchmark names
all_res['Benchmark name'].unique()
# Boolean masks selecting benchmarks by SYCL execution construct.
# wgp: NDRange (work-group) parallel-for kernels.
roi_wgp = all_res['Benchmark name'].isin(['LinearRegressionCoeff_fp32','LinearRegressionCoeff_fp64','MicroBench_LocalMem_int32_4096','MicroBench_LocalMem_fp32_4096','MicroBench_LocalMem_fp64_4096','NBody_NDRange_fp32','NBody_NDRange_fp64','Pattern_Reduction_NDRange_int32','Pattern_Reduction_NDRange_int64','Pattern_Reduction_NDRange_fp32','Pattern_Reduction_NDRange_fp64','Pattern_SegmentedReduction_NDRange_int16','Pattern_SegmentedReduction_NDRange_int32','Pattern_SegmentedReduction_NDRange_int64','Pattern_SegmentedReduction_NDRange_fp32','Pattern_SegmentedReduction_NDRange_fp64','ScalarProduct_NDRange_int32','ScalarProduct_NDRange_int64','ScalarProduct_NDRange_fp32','ScalarProduct_NDRange_fp64','Runtime_DAGTaskThroughput_NDRangeParallelFor','Runtime_IndependentDAGTaskThroughput_NDRangeParallelFor'])
wgp = all_res[roi_wgp]
# hdp: hierarchical parallel-for kernels.
roi_hdp = all_res['Benchmark name'].isin(['Runtime_IndependentDAGTaskThroughput_HierarchicalParallelFor','Runtime_DAGTaskThroughput_HierarchicalParallelFor','NBody_Hierarchical_fp32','NBody_Hierarchical_fp64','Pattern_Reduction_Hierarchical_int32','Pattern_Reduction_Hierarchical_int64','Pattern_Reduction_Hierarchical_fp32','Pattern_Reduction_Hierarchical_fp64','Pattern_SegmentedReduction_Hierarchical_int16','Pattern_SegmentedReduction_Hierarchical_int32','Pattern_SegmentedReduction_Hierarchical_int64','Pattern_SegmentedReduction_Hierarchical_fp32','Pattern_SegmentedReduction_Hierarchical_fp64','ScalarProduct_Hierarchical_int32','ScalarProduct_Hierarchical_int64','ScalarProduct_Hierarchical_fp32','ScalarProduct_Hierarchical_fp64'])
hdp = all_res[roi_hdp]
# task: single-task kernels.
roi_task = all_res['Benchmark name'].isin(['Runtime_IndependentDAGTaskThroughput_SingleTask','Runtime_DAGTaskThroughput_SingleTask'])
task = all_res[roi_task]
# sync: the NDRange kernels without the Runtime_* throughput tests.
roi_sync = all_res['Benchmark name'].isin(['LinearRegressionCoeff_fp32','LinearRegressionCoeff_fp64','MicroBench_LocalMem_int32_4096','MicroBench_LocalMem_fp32_4096','MicroBench_LocalMem_fp64_4096','NBody_NDRange_fp32','NBody_NDRange_fp64','Pattern_Reduction_NDRange_int32','Pattern_Reduction_NDRange_int64','Pattern_Reduction_NDRange_fp32','Pattern_Reduction_NDRange_fp64','Pattern_SegmentedReduction_NDRange_int16','Pattern_SegmentedReduction_NDRange_int32','Pattern_SegmentedReduction_NDRange_int64','Pattern_SegmentedReduction_NDRange_fp32','Pattern_SegmentedReduction_NDRange_fp64','ScalarProduct_NDRange_int32','ScalarProduct_NDRange_int64', 'ScalarProduct_NDRange_fp32','ScalarProduct_NDRange_fp64'])
sync = all_res[roi_sync]
# bkp: everything not selected by any of the masks above. Vectorized boolean
# operators replace the original per-element list negations and zip/and
# combination -- same selection, single pass.
roi_bkp = ~(roi_wgp | roi_hdp | roi_task | roi_sync)
bkp = all_res[roi_bkp]
Now also chop off the leading MicroBench_, Runtime_ and Pattern_ prefixes -- the type of computation only lengthens the variable names in the plots.
def chompLeadingSYCLBenchExperimentType(_dataframe):
    """Strip the experiment-type prefixes (MicroBench_, Runtime_, Pattern_)
    from the 'Benchmark name' column -- they only lengthen axis labels.

    Operates on a copy so that callers holding a slice of a larger frame do
    not trigger pandas' SettingWithCopyWarning; the shortened frame is
    returned and callers reassign (e.g. bkp = chompLeading...(bkp)).
    """
    _dataframe = _dataframe.copy()
    # One regex pass replaces the three sequential literal replacements.
    _dataframe['Benchmark name'] = _dataframe['Benchmark name'].str.replace(
        r'MicroBench_|Runtime_|Pattern_', '', regex=True)
    return _dataframe
# Shorten the benchmark names used as axis labels in the plots below.
bkp = chompLeadingSYCLBenchExperimentType(bkp)
bkp
First we extract the kernels which utilize BKP. There are too many results to present in a single plot.
# Overview plot of the whole bkp subset.
indat = bkp
%%R -i indat -i colour_scale -w 10 -h 10 --units in -r 200
library('ggplot2')
library('cowplot')
# R-safe column names: 'Benchmark name' -> 'Benchmark.name', etc.
names(indat) <- make.names(names(indat), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(indat, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + geom_boxplot() + colour_scale
p1
Still too busy... let's break it down to data-type each benchmark operates on.
#subset just kernels with a verified passing result.
bkp = bkp[bkp['Verification'] == "PASS"]
bkp['Benchmark name'].unique()
#subset just kernels with a verified passing result.
# NOTE(review): duplicate of the filter two lines up -- idempotent, but one
# copy could be removed.
bkp = bkp[bkp['Verification'] == "PASS"]
#we could divide this data-set either by the number of invocations used in application or by the data-type worked with -- we choose the latter.
bkp_fp32 = bkp[bkp['Benchmark name'].str.contains("fp32")]
bkp_fp64 = bkp[bkp['Benchmark name'].str.contains("fp64")]
bkp_int32 = bkp[bkp['Benchmark name'].str.contains("int32")]
bkp_int64 = bkp[bkp['Benchmark name'].str.contains("int64")]
indat = bkp_fp32
# 2x2 grid: fp32 and fp64 results, each in linear and log10 scale.
%%R -i bkp_fp32 -i bkp_fp64 -i colour_scale -w 8.3 -h 11.7 --units in -r 200
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(bkp_fp32) <- make.names(names(bkp_fp32), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(bkp_fp32, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
# NOTE(review): '\\Log' is not a standard TeX macro -- presumably '\\log' was
# intended; confirm how latex2exp renders it.
p2 <- ggplot(bkp_fp32, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
names(bkp_fp64) <- make.names(names(bkp_fp64), unique = FALSE, allow_ = TRUE)
p3 <- ggplot(bkp_fp64, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p4 <- ggplot(bkp_fp64, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
p3 + theme(legend.position="none"),
p4 + theme(legend.position="none"),
labels = c('Float32', '','Float64',''), label_size = 10, align = 'vh', hjust = -2, nrow = 2)
#side legend
#legend <- get_legend(p1 + theme(legend.box.margin = margin(0, 0, 0, 0)))
#plot_grid(pg, legend, rel_widths = c(3, .85))
#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
# Standalone landscape fp32 log-scale figure, saved as PDF for the paper.
%%R -i bkp_fp32 -i colour_scale -w 11.7 -h 8.3 --units in -r 200
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(bkp_fp32) <- make.names(names(bkp_fp32), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(bkp_fp32, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
ggsave('bkp-float.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
print(p1)
# 2x2 grid: int32 and int64 results, each in linear and log10 scale.
%%R -i bkp_int32 -i bkp_int64 -i colour_scale -w 8.3 -h 11.7 --units in -r 200
outlier_size = 0.50
library('ggplot2')
library('latex2exp')
names(bkp_int32) <- make.names(names(bkp_int32), unique = FALSE, allow_ = TRUE)
p5 <- ggplot(bkp_int32, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p6 <- ggplot(bkp_int32, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
names(bkp_int64) <- make.names(names(bkp_int64), unique = FALSE, allow_ = TRUE)
p7 <- ggplot(bkp_int64, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p8 <- ggplot(bkp_int64, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
library('cowplot')
pg <- plot_grid(p5 + theme(legend.position="none"),
p6 + theme(legend.position="none"),
p7 + theme(legend.position="none"),
p8 + theme(legend.position="none"),
labels = c('Int32','','Int64',''), label_size = 10, align = 'vh', hjust = -2, nrow = 2)
#side legend
#legend <- get_legend(p1 + theme(legend.box.margin = margin(0, 0, 0, 0)))
#plot_grid(pg, legend, rel_widths = c(3, .85))
#bottom legend
legend <- get_legend(p5 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
#subset just kernels with a verified passing result.
# NOTE(review): this PASS filter was already applied above -- idempotent.
bkp = bkp[bkp['Verification'] == "PASS"]
bkp_block = bkp[bkp['Benchmark name'].str.contains("BlockedTransform_iter_")]
bkp_bandw = bkp[bkp['Benchmark name'].str.contains("HostDeviceBandwidth_")]
#we could divide this data-set either by the number of invocations used in application or by the data-type worked with -- we choose the latter.
bkp_other = bkp[~(bkp['Benchmark name'].str.contains("fp32") | bkp['Benchmark name'].str.contains("fp64") | bkp['Benchmark name'].str.contains("int32") | bkp['Benchmark name'].str.contains("int64") | bkp['Benchmark name'].str.contains("BlockedTransform_iter_") | bkp['Benchmark name'].str.contains("HostDeviceBandwidth_"))]
#discard all but the largest problem size for DagTaskThroughput kernels
bkp_other = bkp_other[ (~ bkp_other['Benchmark name'].str.contains('DAGTaskThroughput')) | (bkp_other['Benchmark name'].str.contains('DAGTaskThroughput') & (bkp_other['problem-size'] == 65536))]
# Plot the remaining ("other") bkp kernels, linear and log10 scale.
%%R -i bkp_other -i colour_scale -w 8.3 -h 11.7 --units in -r 200
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(bkp_other) <- make.names(names(bkp_other), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(bkp_other, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(bkp_other, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
align = 'vh', hjust = -2, nrow = 2)
#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
This is still too noisy to readily compare, so let's separate the results by each device thereby we can directly compare the SYCL runtimes (implementation paired with backend) on the same hardware.
# Split by device class using the Runtime label (device names are embedded in it).
bkp_other_gpu = bkp_other[bkp_other['Runtime'].str.contains("P100") | bkp_other['Runtime'].str.contains("gfx906")]
bkp_other_cpu = bkp_other[bkp_other['Runtime'].str.contains("Gold")]
# 2x3 grid: CPU row and GPU row, each with linear, log10, and its own legend.
%%R -i bkp_other_gpu -i colour_scale -i bkp_other_cpu -w 11.7 -h 8.3 --units in -r 200
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(bkp_other_cpu) <- make.names(names(bkp_other_cpu), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(bkp_other_cpu, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.text = element_text(size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(bkp_other_cpu, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.text = element_text(size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
names(bkp_other_gpu) <- make.names(names(bkp_other_gpu), unique = FALSE, allow_ = TRUE)
p3 <- ggplot(bkp_other_gpu, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.text = element_text(size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p4 <- ggplot(bkp_other_gpu, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.text = element_text(size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
get_legend(p1 + guides(color = guide_legend(ncol = 1)) + theme(legend.position = "right")),
p3 + theme(legend.position="none"),
p4 + theme(legend.position="none"),
get_legend(p3 + guides(color = guide_legend(ncol = 1)) + theme(legend.position = "right")),
labels = c("Other BKP on CPUs","","","Other BKPs on GPUs","",""),
align = 'hv', hjust = -1.85, vjust = -.8, nrow = 2,
rel_widths = c(1,1,.33))
pg
From the source-code (runtime/blocked_transform.cpp:39-49): (The blocked transform) performs a blocked transform operation using the mandelbrot sequence as kernels. The number of iterations of the sequence -- and hence the runtime of the kernel can be adjusted using Num_iterations.
This benchmark processes the data in chunks that are assigned to independent kernels; it therefore tests concurrent kernel execution and the overlapping of compute with data transfers.
In order for the benchmark to stress these aspects, Num_iterations should be tuned such that the kernel runtime is similar to the data transfer time of one block.
# BlockedTransform results by full benchmark name, linear and log10 scale.
%%R -i bkp_block -i colour_scale -w 8.3 -h 11.7 --units in -r 200
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(bkp_block) <- make.names(names(bkp_block), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(bkp_block, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(bkp_block, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
align = 'vh', hjust = -2, nrow = 2)
#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
Sorted by iteration and block-size -- and shorten the name
bkp_block['Benchmark name']
# Pull the iteration count and blocksize out of the benchmark name.
# NOTE(review): bkp_block is a slice of bkp, so these column assignments may
# raise SettingWithCopyWarning -- consider bkp_block = bkp_block.copy() first.
bkp_block['iter'] = bkp_block['Benchmark name'].str.extract(r'iter_(\d+)_')
bkp_block['blocksize'] = bkp_block['Benchmark name'].str.extract(r'blocksize_(\d+)')
bkp_block
# Same data, now grouped by numeric blocksize on the x-axis.
%%R -i bkp_block -i colour_scale -w 8.3 -h 11.7 --units in -r 200
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(bkp_block) <- make.names(names(bkp_block), unique = FALSE, allow_ = TRUE)
# Order the blocksize factor numerically (it arrives as strings).
bkp_block$blocksize <- reorder(bkp_block$blocksize, as.numeric(bkp_block$blocksize))
p1 <- ggplot(bkp_block, aes(x=blocksize, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Blocksize", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(bkp_block, aes(x=blocksize, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Blocksize", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
align = 'vh', hjust = -2, nrow = 2)
#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
It appears GPU devices perform better over larger blocksizes whereas CPU devices are more affected by increasing the blocksize.
The use of the common CPU backend limits the performance on the Xeon Gold --shown in both the ComputeCpp/CPU and DPC++/CPU runtimes-- which were equally ~an order of magnitude worse than the OpenMP and OpenCL backends. OpenCL had the best performance of any of the backends on the Xeon Gold device -- shown in the ComputeCpp/OpenCL runtime. The hipSYCL/OpenMP runtime has the most variance but was the 2nd best performer over all blocksizes.
On the Tesla P100 the CUDA backend performs well in general; as the blocksize increases the hipSYCL implementation slightly wins out over DPC++. The Vega performs worst of the GPUs on this benchmark; however, the performance gap under the common hipSYCL implementation stays the same as the block-sizes increase -- so we can credit this to comparing unequal hardware (with different specifications).
We now also perform a breakdown to examine whether the other variable iter affects the difference in performance on blocksize.
# Facet the blocksize plots by iteration count to separate that variable.
%%R -i bkp_block -i colour_scale -w 8.3 -h 11.7 --units in -r 200
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(bkp_block) <- make.names(names(bkp_block), unique = FALSE, allow_ = TRUE)
# Order both extracted factors numerically (they arrive as strings).
bkp_block$blocksize <- reorder(bkp_block$blocksize, as.numeric(bkp_block$blocksize))
bkp_block$iter <- reorder(bkp_block$iter, as.numeric(bkp_block$iter))
#rename for plotting
levels(bkp_block$iter) <- paste(levels(bkp_block$iter),"iterations")
p1 <- ggplot(bkp_block, aes(x=blocksize, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Blocksize", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ iter, strip.position = "top", scales = "free_x") + colour_scale
p2 <- ggplot(bkp_block, aes(x=blocksize, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Blocksize", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ iter, strip.position = "top", scales = "free_x") + colour_scale
library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
align = 'vh', hjust = -2, nrow = 2)
#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
As expected by increasing the number of iterations of each test that is run, the execution time also increases. This has the largest impact on the CPU backends because these were the worst performing devices -- thus the penalty they incur is highlighted by increasing the amount of work required in each test.
By breaking this down into their respective iterations we can remove a large amount of variance from the previous plot -- this is shown by the much smaller range of the upper and lower quartiles in the box-plots. We also see less variance with the increased sample size.
As such we select the maximum number of iterations to present in our final results.
# Keep only the maximum iteration count (512) for the final figures.
bkp_block512 = bkp_block[bkp_block['Benchmark name'].str.contains("iter_512")]
# Portrait figure (linear + log10) saved under figs/.
%%R -i bkp_block512 -i colour_scale -w 8.3 -h 11.7 --units in -r 200
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(bkp_block512) <- make.names(names(bkp_block512), unique = FALSE, allow_ = TRUE)
bkp_block512$blocksize <- reorder(bkp_block512$blocksize, as.numeric(bkp_block512$blocksize))
p1 <- ggplot(bkp_block512, aes(x=blocksize, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Blocksize", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(bkp_block512, aes(x=blocksize, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Blocksize", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
align = 'vh', hjust = -2, nrow = 2)
#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
pgw <- plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
print(pgw)
ggsave('figs/bkp_block512.pdf',pgw, width = 8.3, height = 11.7, dpi = 300, units = "in")
# Standalone landscape log10 figure saved as PDF for the paper.
%%R -i bkp_block512 -i colour_scale -w 11.7 -h 8.3 --units in -r 200
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(bkp_block512) <- make.names(names(bkp_block512), unique = FALSE, allow_ = TRUE)
bkp_block512$blocksize <- reorder(bkp_block512$blocksize, as.numeric(bkp_block512$blocksize))
p1 <- ggplot(bkp_block512, aes(x=blocksize, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Blocksize", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
ggsave('blocked-transform.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
print(p1)
This figure presents the performance of the Blocksize test on two GPU devices --the Nvidia Tesla P100 and AMD Vega 20-- and the Xeon Gold CPU. The test measures concurrent kernel execution and overlapping of compute and data transfers. The number of iterations was selected to be 512. The blocksize increases over the x-axis. It appears GPU devices perform better over larger blocksizes whereas CPU devices are more affected by increasing the blocksize.
The use of the common CPU backend limits the performance on the Xeon Gold --shown in both the ComputeCpp/CPU and DPC++/CPU runtimes-- which were equally ~an order of magnitude worse than the OpenMP and OpenCL backends. OpenCL had the best performance of any of the backends on the Xeon Gold device -- shown in the ComputeCpp/OpenCL runtime. The hipSYCL/OpenMP runtime has the most variance but was the 2nd best performer over all blocksizes.
On the Tesla P100 the CUDA backend performs well in general; as the blocksize increases the hipSYCL implementation slightly wins out over DPC++. The Vega performs worst of the GPUs on this benchmark; however, the performance gap under the common hipSYCL implementation stays the same as the block-sizes increase, and stays in the same order of magnitude -- so we can credit this to comparing unequal hardware (with the different specifications of the P100 and the Vega 20).
TODO: investigate block performance difference between CPU (pthreads?) vs Other backends Too heavyweight to be suited to this SYCL test?.
From the source-code (micro/host_device_bandwidth.cpp:53-64):
Microbenchmark measuring host<->device bandwidth for contiguous and strided copies. For non-strided copies we use a dummy kernel, as explicit copy operations are not fully supported by some SYCL implementations. Strided copies use a larger SYCL buffer and copy a portion out of the middle. For example, a (512, 512) element 2D-copy at offset (1, 1) out of a (514, 514) element SYCL buffer. The host buffer is never strided (as this is not supported by SYCL 1.2.1).
To avoid SYCL implementations to just copy the entire buffer when using a strided accessor we use explicit copy operations for strided copies.
#shorten kernel names
# NOTE(review): bkp_bandw was already computed identically above (from the
# full 'Benchmark name'); re-subsetting here before stripping the prefix.
bkp_bandw = bkp[bkp['Benchmark name'].str.contains("HostDeviceBandwidth_")]
# NOTE(review): bkp_bandw is a slice of bkp, so this column assignment may
# raise SettingWithCopyWarning -- consider bkp_bandw = bkp_bandw.copy() first.
bkp_bandw['Benchmark name'] = bkp_bandw['Benchmark name'].str.replace('HostDeviceBandwidth_','')
# Bandwidth results, linear and log10 scale.
%%R -i bkp_bandw -i colour_scale -w 8.3 -h 11.7 --units in -r 200
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(bkp_bandw) <- make.names(names(bkp_bandw), unique = FALSE, allow_ = TRUE)
#bkp_bandw$Benchmark.name <- reorder(bkp_bandw$Benchmark.name, as.numeric(bkp_bandw$blocksize))
p1 <- ggplot(bkp_bandw, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(bkp_bandw, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
align = 'vh', hjust = -2, nrow = 2)
#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
# Standalone landscape log10 bandwidth figure saved as PDF for the paper.
%%R -i bkp_bandw -i colour_scale -w 11.7 -h 8.3 --units in -r 200
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(bkp_bandw) <- make.names(names(bkp_bandw), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(bkp_bandw, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
ggsave('bandwidth.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
print(p1)
It appears that communication takes 4 orders of magnitude longer on the GPU than the CPU -- this is unsurprising since it should only be a SYCL API call on the CPU host to transfer to the CPU device whereas the GPU has to go over PCI-E. Let's split the data by device.
# Split the bandwidth results by device class using the Runtime label
# (the CPU backends cannot report a device name, so Runtime encodes the device).
is_p100 = bkp_bandw['Runtime'].str.contains("P100")
is_gfx906 = bkp_bandw['Runtime'].str.contains("gfx906")
bkp_bandw_gpu = bkp_bandw[is_p100 | is_gfx906]
bkp_bandw_cpu = bkp_bandw[bkp_bandw['Runtime'].str.contains("Gold")]
%%R -i bkp_bandw_gpu -i bkp_bandw_cpu -i colour_scale -w 11.7 -h 8.3 --units in -r 200
# 2x3 grid: CPU devices (top row: linear p1, log10 p2, legend) and GPU devices
# (bottom row: linear p3, log10 p4, legend), each row with its own legend.
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(bkp_bandw_cpu) <- make.names(names(bkp_bandw_cpu), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(bkp_bandw_cpu, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.text = element_text(size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(bkp_bandw_cpu, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.text = element_text(size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
names(bkp_bandw_gpu) <- make.names(names(bkp_bandw_gpu), unique = FALSE, allow_ = TRUE)
p3 <- ggplot(bkp_bandw_gpu, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.text = element_text(size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p4 <- ggplot(bkp_bandw_gpu, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.text = element_text(size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
library('cowplot')
# Third column (rel width .33) carries the per-row legends.
pg <- plot_grid(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
get_legend(p1 + guides(color = guide_legend(ncol = 1)) + theme(legend.position = "right")),
p3 + theme(legend.position="none"),
p4 + theme(legend.position="none"),
get_legend(p3 + guides(color = guide_legend(ncol = 1)) + theme(legend.position = "right")),
labels = c("BKP bandwidth test on CPUs","","","BKP bandwidth test on GPUs","",""),
align = 'hv', hjust = -1.85, vjust = -.8, nrow = 2,
rel_widths = c(1,1,.33))
pg
A direct comparison on the GPU architectures can be made based on suitability of SYCL implementations. For instance,
When we consider
TODO: Summary.
# Normalise benchmark names (helper defined earlier in the notebook), keep only
# verified runs, and for the DAGTaskThroughput family keep only the largest
# problem size so the box plots are not amalgamated over several sizes.
wgp = chompLeadingSYCLBenchExperimentType(wgp)
#subset just kernels with a verified passing result.
wgp = wgp[wgp['Verification'] == "PASS"]
#discard all but the largest problem size for DAGTaskThroughput kernels
wgp_sub = wgp[ (~ wgp['Benchmark name'].str.contains('DAGTaskThroughput')) | (wgp['Benchmark name'].str.contains('DAGTaskThroughput') & (wgp['problem-size'] == 65536))]
%%R -i wgp -i wgp_sub -i colour_scale -w 10 -h 10 --units in -r 200
# Quick overview: one linear-scale box plot over all work-group benchmarks.
library('ggplot2')
library('cowplot')
names(wgp) <- make.names(names(wgp), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(wgp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y="Execution Time (s)") + geom_boxplot() + colour_scale
p1
#subset just kernels with a verified passing result.
# (re-applied defensively; 'wgp' was already filtered in an earlier cell)
wgp = wgp[wgp['Verification'] == "PASS"]
# List the distinct benchmark names to plan the data-type break-down below.
wgp['Benchmark name'].unique()
All datum, except for DAGTask applications, can be presented according to their data-type so this seems like a logical way to break-down and interpret the data.
#subset just kernels with a verified passing result.
wgp = wgp[wgp['Verification'] == "PASS"]
#we could divide this data-set either by the number of invocations used in application or by the data-type worked with -- we choose the latter.
# .copy() avoids the pandas SettingWithCopyWarning on the assignments below.
wgp_fp = wgp[wgp['Benchmark name'].str.contains("_fp")].copy()
# expand=False yields a Series (not a one-column DataFrame) from the single capture group.
wgp_fp['data.type.width'] = wgp_fp['Benchmark name'].str.extract(r'_fp(\d+)', expand=False)
# regex=True is required: pandas >= 2.0 defaults str.replace to literal matching,
# which would silently leave the '_fpNN' suffix in place.
wgp_fp['Benchmark name'] = wgp_fp['Benchmark name'].str.replace(r'_fp(\d+)', '', regex=True)
%%R -i wgp_fp -i colour_scale -w 8.3 -h 11.7 --units in -r 200
# Floating-point work-group benchmarks faceted by data-type width, linear (p1)
# over log10 (p2), with a shared bottom legend.
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(wgp_fp) <- make.names(names(wgp_fp), unique = FALSE, allow_ = TRUE)
# Order facet levels numerically (8, 16, 32, ...) instead of lexically.
wgp_fp$data.type.width <- reorder(wgp_fp$data.type.width, as.numeric(wgp_fp$data.type.width))
#rename for plotting
levels(wgp_fp$data.type.width) <- paste("float",levels(wgp_fp$data.type.width),sep='')
p1 <- ggplot(wgp_fp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ data.type.width, strip.position = "top", scales = "free_x") + colour_scale
p2 <- ggplot(wgp_fp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ data.type.width, strip.position = "top", scales = "free_x") + colour_scale
library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
align = 'vh', hjust = -2, nrow = 2)
#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
#subset just kernels with a verified passing result.
wgp = wgp[wgp['Verification'] == "PASS"]
#we could divide this data-set either by the number of invocations used in application or by the data-type worked with -- we choose the latter.
# .copy() avoids the pandas SettingWithCopyWarning on the assignments below.
wgp_int = wgp[wgp['Benchmark name'].str.contains("_int")].copy()
# expand=False yields a Series (not a one-column DataFrame) from the single capture group.
wgp_int['data.type.width'] = wgp_int['Benchmark name'].str.extract(r'_int(\d+)', expand=False)
# regex=True is required: pandas >= 2.0 defaults str.replace to literal matching,
# which would silently leave the '_intNN' suffix in place.
wgp_int['Benchmark name'] = wgp_int['Benchmark name'].str.replace(r'_int(\d+)', '', regex=True)
%%R -i wgp_int -i colour_scale -w 8.3 -h 11.7 --units in -r 200
# Integer work-group benchmarks faceted by data-type width, linear (p1) over
# log10 (p2), with a shared bottom legend.
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(wgp_int) <- make.names(names(wgp_int), unique = FALSE, allow_ = TRUE)
# Order facet levels numerically (8, 16, 32, ...) instead of lexically.
wgp_int$data.type.width <- reorder(wgp_int$data.type.width, as.numeric(wgp_int$data.type.width))
#rename for plotting
levels(wgp_int$data.type.width) <- paste("int",levels(wgp_int$data.type.width),sep='')
p1 <- ggplot(wgp_int, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ data.type.width, strip.position = "top", scales = "free_x") + colour_scale
p2 <- ggplot(wgp_int, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ data.type.width, strip.position = "top", scales = "free_x") + colour_scale
library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
align = 'vh', hjust = -2, nrow = 2)
#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
%%R -i wgp_int -i colour_scale -w 11.7 -h 8.3 --units in -r 200
# Landscape int32-only log10 plot, written out as wgp-int.pdf.
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(wgp_int) <- make.names(names(wgp_int), unique = FALSE, allow_ = TRUE)
wgp_int$data.type.width <- reorder(wgp_int$data.type.width, as.numeric(wgp_int$data.type.width))
# Keep only the 32-bit rows; the level renaming below happens after the subset.
wgp_int <- subset(wgp_int, data.type.width=="32")
#rename for plotting
levels(wgp_int$data.type.width) <- paste("int",levels(wgp_int$data.type.width),sep='')
p1 <- ggplot(wgp_int, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
ggsave('wgp-int.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
print(p1)
#subset just kernels with a verified passing result.
wgp = wgp[wgp['Verification'] == "PASS"]
#we could divide this data-set either by the number of invocations used in application or by the data-type worked with -- we choose the latter.
# .copy() avoids the pandas SettingWithCopyWarning on the assignments below.
wgp_fp = wgp[wgp['Benchmark name'].str.contains("_fp")].copy()
# expand=False yields a Series (not a one-column DataFrame) from the single capture group.
wgp_fp['data.type.width'] = wgp_fp['Benchmark name'].str.extract(r'_fp(\d+)', expand=False)
# regex=True is required: pandas >= 2.0 defaults str.replace to literal matching,
# which would silently leave the '_fpNN' suffix in place.
wgp_fp['Benchmark name'] = wgp_fp['Benchmark name'].str.replace(r'_fp(\d+)', '', regex=True)
%%R -i wgp_fp -i colour_scale -w 11.7 -h 8.3 --units in -r 200
# Landscape fp32-only log10 plot, written out as wgp-float.pdf.
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(wgp_fp) <- make.names(names(wgp_fp), unique = FALSE, allow_ = TRUE)
wgp_fp$data.type.width <- reorder(wgp_fp$data.type.width, as.numeric(wgp_fp$data.type.width))
# Keep only the 32-bit rows; the level renaming below happens after the subset.
wgp_fp <- subset(wgp_fp, data.type.width=="32")
#rename for plotting
levels(wgp_fp$data.type.width) <- paste("fp",levels(wgp_fp$data.type.width),sep='')
p1 <- ggplot(wgp_fp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
ggsave('wgp-float.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
print(p1)
Let's plot the last couple of kernels without explicit data-types:
Runtime_IndependentDAGTaskThroughput_NDRangeParallelFor and Runtime_DAGTaskThroughput_NDRangeParallelFor.
#subset just kernels with a verified passing result.
wgp = wgp[wgp['Verification'] == "PASS"]
#select the remaining kernels without an explicit data-type: the DAGTaskThroughput family.
wgp_dag = wgp[wgp['Benchmark name'].str.contains("DAGTaskThroughput")]
%%R -i wgp_dag -i colour_scale -w 8.3 -h 11.7 --units in -r 200
# DAGTaskThroughput box plots, linear (p1) over log10 (p2), shared bottom legend.
# (All problem sizes are still amalgamated here -- see the follow-up cells.)
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(wgp_dag) <- make.names(names(wgp_dag), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(wgp_dag, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(wgp_dag, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
align = 'vh', hjust = -2, nrow = 2)
#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
From the box-and-whisker plots, we notice there is a large variation in performance on most SYCL Runtimes -- with exceptions on the Xeon Gold ComputeCpp/CPU and the Vega 20 hipSYCL/ROCm -- which merits additional investigation.
# Inspect the distinct problem sizes mixed into the DAG plots above.
wgp_dag['problem-size'].unique()
Eureka! The data is amalgamated over several different problem sizes. Let's present these results separately.
%%R -i wgp_dag -i colour_scale -w 8.3 -h 11.7 --units in -r 200
# DAGTaskThroughput, linear scale, faceted by problem size.
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(wgp_dag) <- make.names(names(wgp_dag), unique = FALSE, allow_ = TRUE)
# Order facet levels numerically, then label them "Size: N".
wgp_dag$problem.size <- reorder(wgp_dag$problem.size, as.numeric(wgp_dag$problem.size))
#rename for plotting
levels(wgp_dag$problem.size) <- paste("Size: ",levels(wgp_dag$problem.size))
p1 <- ggplot(wgp_dag, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ problem.size, strip.position = "top", scales = "free_x") + colour_scale
p1
#p2 <- ggplot(wgp_dag, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ problem.size, strip.position = "top", scales = "free_x") + colour_scale
#library('cowplot')
#pg <- plot_grid(p1 + theme(legend.position="none"),
# p2 + theme(legend.position="none"),
# align = 'vh', hjust = -2, nrow = 2)
#bottom legend
#legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
#plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
%%R -i wgp_dag -i colour_scale -w 8.3 -h 11.7 --units in -r 200
# Same plot on a log10 y-axis.
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(wgp_dag) <- make.names(names(wgp_dag), unique = FALSE, allow_ = TRUE)
wgp_dag$problem.size <- reorder(wgp_dag$problem.size, as.numeric(wgp_dag$problem.size))
#rename for plotting
levels(wgp_dag$problem.size) <- paste("Size: ",levels(wgp_dag$problem.size))
p2 <- ggplot(wgp_dag, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + facet_wrap( ~ problem.size, strip.position = "top", scales = "free_x") + colour_scale
p2
#library('cowplot')
#pg <- plot_grid(p1 + theme(legend.position="none"),
# p2 + theme(legend.position="none"),
# align = 'vh', hjust = -2, nrow = 2)
#bottom legend
#legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
#plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
As the problem size increases, we see the overall magnitudes of execution time also increase. This is most apparent when examining the hipSYCL/OpenMP runtime on the Xeon Gold -- especially on the TaskThroughput kernel. However, the relative ordering remains the same, with the DPC++/CPU runtime performing the best on all problem sizes, followed by the ComputeCpp/OpenCL runtime, then the CUDA backends performing equally regardless of whether hipSYCL or DPC++ is used as the implementation. Since we have the data, we subset the data by solely selecting the largest problem size (65536 or 2^16) whenever these DAG kernels are presented.
#subset just kernels with a verified passing result.
wgp = wgp[wgp['Verification'] == "PASS"]
wgp_sub_dag = wgp[wgp['Benchmark name'].str.contains("DAGTaskThroughput")]
#select just the largest problem size
# .copy() avoids the pandas SettingWithCopyWarning on the assignment below.
wgp_sub_dag = wgp_sub_dag[wgp_sub_dag['problem-size'] == 65536].copy()
#rename/shorten names: the greedy (\w+)_ keeps everything up to the LAST
# underscore, i.e. it drops only the trailing segment (e.g. '_SingleTask');
# expand=False yields a Series rather than a one-column DataFrame.
wgp_sub_dag['Benchmark name'] = wgp_sub_dag['Benchmark name'].str.extract(r'(\w+)_', expand=False)
%%R -i wgp_sub_dag -i colour_scale -w 8.3 -h 11.7 --units in -r 200
# Largest-problem-size DAG kernels only: linear (p1) over log10 (p2), shared legend.
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(wgp_sub_dag) <- make.names(names(wgp_sub_dag), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(wgp_sub_dag, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(wgp_sub_dag, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
align = 'vh', hjust = -2, nrow = 2)
#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
SYCL offers compiler level support –doesn’t change the underlying execution model of the kernel– for expressing the hierarchical nature of data-parallelism –such as work-groups– and can be used for performance tuning. A range is provided into the enqueuing functions to specify the number of work-groups to launch and an optional size of each work-group.
parallel_for_work_item: Use of this function in the suite indicates there has been an attempt made to optimize the application to use private memory. This corresponds to the lowest level cache / smallest-faster memory on the accelerator. 6 of the 37 applications examined make use of enqueuing via parallel_for_work_item, namely, dag_task_throughput_independent (1), dag_task_throughput_sequential (1), nbody (4), scalar_prod (4), segmentedreduction (3) and reduction (3).
parallel_for_work_group: Presents a degree of optimization around the use of local memory, because all variables declared in this scope are allocated in workgroup local memory. The same applications that use parallel_for_work_item also use parallel_for_work_group. The number of times they are used differ; dag_task_throughput_independent (1), dag_task_throughput_sequential (1), n-body (1), scalar_prod (2), segmentedreduction (1) and reduction (1).
# Normalise benchmark names (helper defined earlier in the notebook), keep only
# verified runs, and for the DAGTaskThroughput family keep only the largest
# problem size.
hdp = chompLeadingSYCLBenchExperimentType(hdp)
#subset just kernels with a verified passing result.
hdp = hdp[hdp['Verification'] == "PASS"]
#discard all but the largest problem size for DAGTaskThroughput kernels
hdp = hdp[ (~ hdp['Benchmark name'].str.contains('DAGTaskThroughput')) | (hdp['Benchmark name'].str.contains('DAGTaskThroughput') & (hdp['problem-size'] == 65536))]
%%R -i hdp -i colour_scale -w 8.3 -h 11.7 --units in -r 200
# Hierarchical-data-parallel kernels: linear (p1) over log10 (p2), shared legend.
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(hdp) <- make.names(names(hdp), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(hdp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(hdp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
align = 'vh', hjust = -2, nrow = 2)
#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
%%R -i hdp -i colour_scale -w 11.7 -h 8.3 --units in -r 200
# Landscape log10 variant only, written out as hdp-dt.pdf.
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(hdp) <- make.names(names(hdp), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(hdp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 50, hjust = 1, size = 5)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
ggsave('hdp-dt.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
print(p1)
A kernel is executed once, conceptually, on a single compute-unit, in one work-group, as one work-item; these kernels can be executed on multiple devices and queues and encompass task-based parallelism.
It is used with the single_task function. In the suite, 3 applications use this construct; dag_task_throughput_sequential (1), dag_task_throughput_independent (1) and host_device_bandwidth (1). However, host_device_bandwidth submits a non-operation single_task to force a read-only buffer to be copied in the micro-benchmark; since this kernel does no work, it is omitted from the evaluation.
# Normalise benchmark names (helper defined earlier) and keep only verified runs
# for the single_task experiments.
task = chompLeadingSYCLBenchExperimentType(task)
#subset just kernels with a verified passing result.
task = task[task['Verification'] == "PASS"]
%%R -i task -i colour_scale -w 8.3 -h 11.7 --units in -r 200
# single_task kernels: linear (p1) over log10 (p2), shared bottom legend.
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(task) <- make.names(names(task), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(task, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(task, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
align = 'vh', hjust = -2, nrow = 2)
#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
%%R -i task -i colour_scale -w 11.7 -h 8.3 --units in -r 200
# Landscape log10 variant only, written out as task.pdf.
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(task) <- make.names(names(task), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(task, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
ggsave('task.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
print(p1)
In general, operations between the host and the device(s) will require synchronization; such as buffer destruction, host accessors, command group enqueue, queue operations etc.
Instead, we focus on user-controllable synchronization events: those that occur within a kernels execution, either globally or locally – within a work-group.
The barrier function is used inside kernels to synchronize between work-items in a work-group.
It is used in 7 of the 37 kernels, namely, reduction (1), segmentedreduction (1), lin_reg_coeff (2), scalar_prod (2), nbody (2), local_mem (2). lin_reg_coeff, scalar_prod and local_mem request a local_space fence synchronization within a work-group whereas reduction, segmented_reduction and nbody use the default global barrier.
NDRange versions of these kernels are the ones which contain barriers – the hierarchical variations do not.
The nbody kernel contains two barriers in the same invocation, as does the local_mem benchmark. Reduction contains one barrier in the inner-most loop of the NDRange implementation.
# Normalise benchmark names (helper defined earlier) and keep only verified runs
# for the synchronization (barrier) experiments.
sync = chompLeadingSYCLBenchExperimentType(sync)
#subset just kernels with a verified passing result.
sync = sync[sync['Verification'] == "PASS"]
%%R -i sync -i colour_scale -w 8.3 -h 11.7 --units in -r 200
# Barrier-using kernels: linear (p1) over log10 (p2), shared bottom legend.
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(sync) <- make.names(names(sync), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(sync, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y="Execution Time (s)") + geom_boxplot(outlier.size = outlier_size) + colour_scale
p2 <- ggplot(sync, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
library('cowplot')
pg <- plot_grid(p1 + theme(legend.position="none"),
p2 + theme(legend.position="none"),
align = 'vh', hjust = -2, nrow = 2)
#bottom legend
legend <- get_legend(p1 + guides(color = guide_legend(nrow = 4)) + theme(legend.position = "bottom"))
plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
%%R -i sync -i colour_scale -w 11.7 -h 8.3 --units in -r 200
# Landscape log10 variant only, written out as sync.pdf.
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
names(sync) <- make.names(names(sync), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(sync, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
ggsave('sync.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
print(p1)
! rm -r ./computecpp-benchmarks
! mkdir ./computecpp-benchmarks && cd ./computecpp-benchmarks && cmake ../.. -DSYCL_IMPL=ComputeCpp -DCMAKE_PREFIX_PATH=/tmp/ComputeCpp-latest && make -j16 && rm -r Makefile CMakeCache.txt CMakeFiles && cd ..
Update the default symlink for the running suite to operate on.
! rm -r ./benchmarks
! ln -s ./computecpp-benchmarks ./benchmarks
Run the benchmarks on CPU:
! ./run-suite cpu
Rename the results to something more descriptive.
! tail -c +1 sycl-bench.csv > sycl-bench.csv
! mv ./sycl-bench.csv ./sycl-bench-gold-computecpp.csv
Note: ComputeCpp's support on Nvidia GPUs is experimental -- it uses a PTX backend. Most benchmarks fail to compile with it so it is omitted.
Compile with OpenCL, CPU and CUDA/PTX backends.
Note: We expect some benchmarks to fail to build as at the time of investigation (May 2020) SYCL on CUDA in DPC++ had ~60% of features implemented.
Note: We cannot collect individual kernel times with DPC++ because the benchmark suite uses the SYCL profiling info construct, which is not supported on the host device.
We must hide the OpenCL ICDs during compilation time to force the CPU backend.
! mv /etc/OpenCL/vendors/amd.icd /etc/OpenCL/vendors/amd.icdX
! mv /etc/OpenCL/vendors/nvidia.icd /etc/OpenCL/vendors/nvidia.icdX
! rm -r ./dpc++-cpu-benchmarks
! mkdir ./dpc++-cpu-benchmarks && cd ./dpc++-cpu-benchmarks && cmake ../.. -DSYCL_IMPL=LLVM -DDPC++_INSTALL_DIR=/tmp/llvm-sycl/build/install && make -j16 --keep-going
! cd ./dpc++-cpu-benchmarks && rm -r Makefile CMakeCache.txt CMakeFiles cmake_install.cmake && cd ..
! mv /etc/OpenCL/vendors/amd.icdX /etc/OpenCL/vendors/amd.icd
! mv /etc/OpenCL/vendors/nvidia.icdX /etc/OpenCL/vendors/nvidia.icd
Update the default symlink for the running suite to operate on.
! rm -r ./benchmarks
! ln -s ./dpc++-benchmarks ./benchmarks
Run the benchmarks on CPU:
! rm -r ./sycl-bench.csv
! LD_LIBRARY_PATH=${LD_LIBARARY_PATH}:/tmp/llvm-sycl/build/install/lib ./run-suite cpu
Rename the results to something more descriptive.
! tail -c +1 sycl-bench.csv > sycl-bench.csv
! mv ./sycl-bench.csv ./sycl-bench-gold-kt-dpc++-cpu.csv
Note: if running this from within the docker image you may need to rebuild the DPC++ CUDA backend in the running Docker instance -- I believe this may be due to CUDA runtime stubs missing during the docker build phase, but I haven't investigated further. To rebuild the backend:
cd /tmp
wget https://github.com/intel/llvm/archive/sycl.zip -O /tmp/llvm-sycl.zip
unzip /tmp/llvm-sycl.zip -d /tmp
mv /tmp/llvm-sycl /tmp/llvm-sycl-cuda
mkdir /tmp/llvm-sycl-cuda/build
cd /tmp/llvm-sycl-cuda/build
CC=gcc CXX=g++ CMAKE_LIBRARY_PATH=/usr/local/cuda/lib64/stubs python /tmp/llvm-sycl-cuda/buildbot/configure.py --cuda
python /tmp/llvm-sycl-cuda/buildbot/compile.py
Build the suite against the DPC++ CUDA backend.
! rm -r ./dpc++-cuda-benchmarks
! mkdir ./dpc++-cuda-benchmarks && cd ./dpc++-cuda-benchmarks && cmake ../.. -DSYCL_IMPL=LLVM-CUDA -DDPC++_INSTALL_DIR=/tmp/llvm-sycl-cuda/build/install && make -j16 --keep-going
! cd ./dpc++-cuda-benchmarks && rm -r Makefile CMakeCache.txt CMakeFiles && cd ..
Update the default symlink for the running suite to operate on.
! rm -r ./benchmarks
! ln -s ./dpc++-cuda-benchmarks ./benchmarks
Run the benchmarks on GPU:
! rm -r ./sycl-bench.csv
! LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/tmp/llvm-sycl-cuda/build/install/lib ./run-suite gpu
Strip the leading comment character from the header line, then rename the results to something more descriptive. A temporary file is required because `tail f > f` truncates the input before it is read.
! tail -c +2 sycl-bench.csv > sycl-bench.tmp && mv sycl-bench.tmp sycl-bench.csv
! mv ./sycl-bench.csv ./sycl-bench-p100-kt-dpc++-cuda.csv
! rm -r ./hipsycl-cpu-kt-benchmarks
! mkdir ./hipsycl-cpu-kt-benchmarks && cd ./hipsycl-cpu-kt-benchmarks && cmake ../.. -DSYCL_IMPL=hipSYCL -DhipSYCL_DIR=/opt/hipSYCL/lib/cmake -DHIPSYCL_PLATFORM=cpu && make -j16 --keep-going && rm -r Makefile CMakeCache.txt CMakeFiles && cd ..
Update the default symlink for the running suite to operate on.
! rm -r ./benchmarks
! ln -s ./hipsycl-cpu-kt-benchmarks ./benchmarks
Run the benchmarks on CPU:
! ./run-suite cpu
Strip the leading comment character from the header line, then rename the results to something more descriptive. A temporary file is required because `tail f > f` truncates the input before it is read.
! tail -c +2 sycl-bench.csv > sycl-bench.tmp && mv sycl-bench.tmp sycl-bench.csv
! mv ./sycl-bench.csv ./sycl-bench-gold-kt-hipsycl-cpu.csv
Compile with hipSYCL-rocm and remove non-applications from the final build.
! rm -r ./hipsycl-rocm-benchmarks
! mkdir ./hipsycl-rocm-benchmarks && cd ./hipsycl-rocm-benchmarks && cmake ../.. -DSYCL_IMPL=hipSYCL -DhipSYCL_DIR=/opt/hipSYCL/lib/cmake -DHIPSYCL_PLATFORM=rocm -DHIPSYCL_GPU_ARCH=gfx906 && make -j16 && rm -r Makefile CMakeCache.txt CMakeFiles && cd ..
Update the default symlink for the running suite to operate on.
! rm -r ./benchmarks
! ln -s ./hipsycl-rocm-benchmarks ./benchmarks
Run the benchmarks on GPU:
! ./run-suite gpu
Strip the leading comment character from the header line, then rename the results to something more descriptive. A temporary file is required because `tail f > f` truncates the input before it is read.
! tail -c +2 sycl-bench.csv > sycl-bench.tmp && mv sycl-bench.tmp sycl-bench.csv
! mv ./sycl-bench.csv ./sycl-bench-gfx906-hipsycl-rocm.csv
perf is installed with:
apt install linux-tools-5.4.0-42-generic linux-tools-generic
proxy -- /etc/apt/apt.conf:
Acquire::http::Proxy "http://proxy.ftpn.ornl.gov:3128";
Acquire::https::Proxy "http://proxy.ftpn.ornl.gov:3128";
The environment may need to be reinstalled within a running docker instance, this is achieved with the following:
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends curl gnupg && \
curl -sL http://repo.radeon.com/rocm/apt/debian/rocm.gpg.key | apt-key add - && \
sh -c 'echo deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main > /etc/apt/sources.list.d/rocm.list' && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
sudo \
libelf1 \
libnuma-dev \
build-essential \
git \
vim-nox \
cmake-curses-gui \
kmod \
file \
rocm-dev
Compile with hipSYCL-rocm and remove non-applications from the final build.
! rm -r ./hipsycl-rocm-benchmarks
! mkdir ./hipsycl-rocm-benchmarks && cd ./hipsycl-rocm-benchmarks && cmake ../.. -DSYCL_IMPL=hipSYCL -DhipSYCL_DIR=/opt/hipSYCL/lib/cmake -DHIPSYCL_PLATFORM=rocm -DHIPSYCL_GPU_ARCH=gfx906 && make -j16 && rm -r Makefile CMakeCache.txt CMakeFiles && cd ..
Update the default symlink for the running suite to operate on.
! rm -r ./benchmarks
! ln -s ./hipsycl-rocm-benchmarks ./benchmarks
Run the benchmarks on GPU:
! ./run-suite gpu
Strip the leading comment character from the header line, then rename the results to something more descriptive. A temporary file is required because `tail f > f` truncates the input before it is read.
! tail -c +2 sycl-bench.csv > sycl-bench.tmp && mv sycl-bench.tmp sycl-bench.csv
! mv ./sycl-bench.csv ./sycl-bench-gfx906-hipsycl-rocm.csv
Compile with hipSYCL-cuda and remove non-applications from the final build.
! rm -r ./hipsycl-cuda-benchmarks
! mkdir ./hipsycl-cuda-benchmarks && cd ./hipsycl-cuda-benchmarks && cmake ../.. -DSYCL_IMPL=hipSYCL -DhipSYCL_DIR=/opt/hipSYCL/lib/cmake -DHIPSYCL_PLATFORM=cuda -DHIPSYCL_GPU_ARCH=sm_60 && make -j16 && rm -r Makefile CMakeCache.txt CMakeFiles && cd ..
Update the default symlink for the running suite to operate on.
! rm -r ./benchmarks
! ln -s ./hipsycl-cuda-benchmarks ./benchmarks
Run the benchmarks on GPU:
! ./run-suite gpu
Strip the leading comment character from the header line, then rename the results to something more descriptive. A temporary file is required because `tail f > f` truncates the input before it is read.
! tail -c +2 sycl-bench.csv > sycl-bench.tmp && mv sycl-bench.tmp sycl-bench.csv
! mv ./sycl-bench.csv ./sycl-bench-p100-hipsycl-cuda.csv
! rm -r ./trisycl-cpu-benchmarks
! mkdir ./trisycl-cpu-benchmarks && cd ./trisycl-cpu-benchmarks && cmake ../.. -DSYCL_IMPL=triSYCL -DTRISYCL_TBB=ON -DTRISYCL_INCLUDE_DIR=/tmp/triSYCL-master/include && make -j16 --keep-going
! cd ./trisycl-cpu-benchmarks && rm -r Makefile CMakeCache.txt CMakeFiles && cd ..
Or if you want to build with just TBB and no OpenMP use:
! rm -r ./trisycl-cpu-benchmarks
! mkdir ./trisycl-cpu-benchmarks && cd ./trisycl-cpu-benchmarks && cmake ../.. -DSYCL_IMPL=triSYCL -DTRISYCL_TBB=ON -DTRISYCL_OPENMP=OFF -DTRISYCL_INCLUDE_DIR=/tmp/triSYCL-master/include && make -j16 --keep-going
! cd ./trisycl-cpu-benchmarks && rm -r Makefile CMakeCache.txt CMakeFiles && cd ..
Or with OpenMP and no TBB:
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib CC=/opt/hipSYCL/llvm/bin/clang CXX=/opt/hipSYCL/llvm/bin/clang++ cmake .. -DSYCL_IMPL=triSYCL -DTRISYCL_TBB=OFF -DTRISYCL_OMP=ON -DTRISYCL_INCLUDE_DIR=/tmp/triSYCL-master/include
Update the default symlink for the running suite to operate on.
! rm -r ./benchmarks
! ln -s ./trisycl-cpu-benchmarks ./benchmarks
Run the benchmarks on CPU:
! rm ./sycl-bench.csv
! ./run-suite default
Remove the first comment character from the very first line in the results -- we want to use these column names in the analysis. Then rename the results to something more descriptive.
! tail -c +2 sycl-bench.csv > sycl-bench.csv
! mv ./sycl-bench.csv ./sycl-bench-gold-trisycl-cpu.csv
Given a workgroup size of 256, the global work ranges from 256 (2^8) to 1048576 (2^20) increasing by a power of two in each of these contrived vector addition tests. Thus on this 32-core (hyperthreaded) Gold CPU we should see a plateau in performance at 8192 (2^13) and onwards -- assuming no overheads in the SYCL implementation.
# Load the vec_add results from every SYCL implementation/backend and
# outer-merge them into one dataframe (rows from all files are kept).
import pandas
vec = pandas.read_csv('./vec_add_ComputeCPP-cpu.csv',comment='#')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_dpc++-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_hipsycl-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_trisycl-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_ComputeCPP-opencl.csv',comment='#'),how='outer')
# Sanity check: report how many timing samples each data point carries
# (inferred from the first row's space-separated sample string).
print("using sample size of:",len(list(map(float,vec['run-time-samples'][0].split()))),"elements per data-point")
Restructure dataframe to split up the run-time-samples into separate run-time-sample by duplicating each row with a unique sample -- this is for R to do the heavy-lifting by generating the box-and-whisker plots and summary statistics.
# Explode the space-separated 'run-time-samples' string into one duplicated
# row per individual 'run-time-sample', so R can compute the box plots and
# summary statistics directly.
from tqdm import tqdm
expanded = []
for index, row in tqdm(vec.iterrows(), total=vec.shape[0]):
    samples = row['run-time-samples']
    # .split() without an argument tolerates repeated/trailing whitespace,
    # which would make float('') crash with the original split(' ').
    for y in samples.split():
        tmprow = row.copy()  # copy: avoid aliasing one Series across rows
        tmprow['run-time-sample'] = float(y)
        expanded.append(tmprow)
# Build the frame once: DataFrame.append in a loop was O(n^2) and the method
# was deprecated and removed in pandas 2.x.
outdat = pandas.DataFrame(expanded)
outdat = outdat.drop(columns=['run-time-samples'])
vec = outdat
vec
We also add the Runtime variable which is named according to the sycl-implementation and device-name. This is needed because all CPU backends cannot query the device name.
def clear_up_runtime (row):
    """Derive a human-readable 'Runtime' label from the 'device-name' and
    'sycl-implementation' columns of a result row.

    Both columns are needed because the CPU backends cannot report a
    distinctive device name. Returns None (after logging the unmatched
    values) for unrecognised combinations.
    """
    if row['device-name'] == "Device 66af" and row['sycl-implementation'] == "hipSYCL":
        return "Vega 20 - hipSYCL/ROCm" # (gfx906)
    elif row['device-name'] == "Tesla P100-PCIE-12GB" and row['sycl-implementation'] == "hipSYCL":
        return "Tesla P100 - hipSYCL/CUDA"
    elif row['device-name'] == "hipCPU OpenMP host device" and row['sycl-implementation'] == "hipSYCL":
        return "Xeon Gold - hipSYCL/OpenMP"
    elif row['device-name'] == "Tesla P100-PCIE-12GB" and row['sycl-implementation'] == "LLVM CUDA (Codeplay)":
        return "Tesla P100 - DPC++/CUDA"
    elif row['device-name'] == "SYCL host device" and row['sycl-implementation'] == 'LLVM (Intel DPC++)':
        return "Xeon Gold - DPC++/CPU"
    #todo: generate and check this one:
    elif row['device-name'] == "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz" and row['sycl-implementation'] == 'LLVM (Intel DPC++)':
        return "Xeon Gold - DPC++/OpenCL"
    elif row['device-name'] == "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz" and row['sycl-implementation'] == 'ComputeCpp':
        return "Xeon Gold - ComputeCpp/OpenCL"
    elif row['device-name'] == "Host Device" and row['sycl-implementation'] == 'ComputeCpp':
        return "Xeon Gold - ComputeCpp/CPU"
    elif row['device-name'] == 'unknown' and row['sycl-implementation'] == 'triSYCL':
        return "Xeon Gold - triSYCL/OpenMP"
    else:
        # Bug fix: the original printed the undefined name `device`
        # (NameError); report the actual unmatched column values instead.
        print("unrecognised runtime:", row['device-name'], row['sycl-implementation'])
        return None
vec['Runtime'] = vec.apply (lambda row: clear_up_runtime(row), axis=1)
Convert these runtimes to factors.
%%R -i vec -o vec
# Convert Runtime to a factor so ggplot treats it as a discrete variable
# with stable levels for colour assignment.
vec$Runtime <- as.factor(vec$Runtime)
Permanently assign colour to each runtime -- to avoid confusion and colour reuse when plots are broken down into types of accelerator.
%%R -i vec -o colour_scale
# While viridis is a great colour palette, we need high contrast between neighouring elements --like the rainbow palette-- but still need to be colour-blind friendly.
#library('viridisLite')
#colours <- viridisLite::viridis(length(unique(all_res$Runtime)))
#library(RColorBrewer)
#colours <- brewer.pal(length(unique(all_res$Runtime)),'Dark2')
library(scales)
# Pin one colour per Runtime level by name, so colours stay consistent
# across plots even when a plot contains only a subset of runtimes.
colours <- hue_pal()(length(unique(vec$Runtime)))
names(colours) <- levels(vec$Runtime)
colour_scale <- scale_colour_manual(name = "Runtime",values = colours)
vec
%%R -i vec -i colour_scale -w 8.3 -h 11.7 --units in -r 200
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
library('scales')
# make.names: R-safe column names ('problem-size' -> 'problem.size', etc.)
names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)
# One box plot per (problem size, runtime) pair, faceted per benchmark,
# log2 x-axis of problem sizes, using the pre-assigned colour scale.
p1 <- ggplot(vec, aes(x=problem.size, y=run.time.sample, colour=Runtime, group = interaction(problem.size, Runtime))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Problem Size", y="Execution Time (s)") + geom_boxplot(position="identity",alpha=0.5) + colour_scale + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Problem Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))
print(p1)
And under a log-transform on the y-axis:
%%R -i vec -i colour_scale -w 8.3 -h 11.7 --units in -r 200
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
library('scales')
names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(vec, aes(x=problem.size, y=run.time.sample, colour=Runtime, group = interaction(problem.size, Runtime))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Problem Size", y="Execution Time (s)") + geom_boxplot(position="identity",alpha=0.5) + colour_scale + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Problem Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + scale_y_log10("Execution Time", breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))
print(p1)
Let's try again with more runtime data... that may clear up any trends -- too speculative with just 50 repeats. Run the following script to collect the results --and feel free to change the --num-runs in the run suite to whatever you'd prefer. The following data was generated with 1000 repeats.
! ./run_vec_add_tests.sh
# Load the larger-repeat (1000 runs) vec_add results for every SYCL
# implementation/backend and outer-merge them into one dataframe.
import pandas
vec = pandas.read_csv('./vec_add_tests_ComputeCPP-cpu.csv',comment='#')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_tests_dpc++-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_tests_hipsycl-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_tests_trisycl-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_tests_ComputeCPP-opencl.csv',comment='#'),how='outer')
# Sanity check: report how many timing samples each data point carries.
print("using sample size of:",len(list(map(float,vec['run-time-samples'][0].split()))),"elements per data-point")
Restructure dataframe to split up the run-time-samples into separate run-time-sample by duplicating each row with a unique sample -- this is for R to do the heavy-lifting by generating the box-and-whisker plots and summary statistics.
# Explode the space-separated 'run-time-samples' string into one duplicated
# row per individual 'run-time-sample' (same transform as the earlier cells).
from tqdm import tqdm
expanded = []
for index, row in tqdm(vec.iterrows(), total=vec.shape[0]):
    samples = row['run-time-samples']
    # .split() without an argument tolerates repeated/trailing whitespace,
    # which would make float('') crash with the original split(' ').
    for y in samples.split():
        tmprow = row.copy()  # copy: avoid aliasing one Series across rows
        tmprow['run-time-sample'] = float(y)
        expanded.append(tmprow)
# Build the frame once: DataFrame.append in a loop was O(n^2) and the method
# was deprecated and removed in pandas 2.x.
outdat = pandas.DataFrame(expanded)
outdat = outdat.drop(columns=['run-time-samples'])
vec = outdat
vec
We also add the Runtime variable which is named according to the sycl-implementation and device-name. This is needed because all CPU backends cannot query the device name.
def clear_up_runtime (row):
    """Derive a human-readable 'Runtime' label from the 'device-name' and
    'sycl-implementation' columns of a result row.

    Both columns are needed because the CPU backends cannot report a
    distinctive device name. Returns None (after logging the unmatched
    values) for unrecognised combinations.
    """
    if row['device-name'] == "Device 66af" and row['sycl-implementation'] == "hipSYCL":
        return "Vega 20 - hipSYCL/ROCm" # (gfx906)
    elif row['device-name'] == "Tesla P100-PCIE-12GB" and row['sycl-implementation'] == "hipSYCL":
        return "Tesla P100 - hipSYCL/CUDA"
    elif row['device-name'] == "hipCPU OpenMP host device" and row['sycl-implementation'] == "hipSYCL":
        return "Xeon Gold - hipSYCL/OpenMP"
    elif row['device-name'] == "Tesla P100-PCIE-12GB" and row['sycl-implementation'] == "LLVM CUDA (Codeplay)":
        return "Tesla P100 - DPC++/CUDA"
    elif row['device-name'] == "SYCL host device" and row['sycl-implementation'] == 'LLVM (Intel DPC++)':
        return "Xeon Gold - DPC++/CPU"
    #todo: generate and check this one:
    elif row['device-name'] == "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz" and row['sycl-implementation'] == 'LLVM (Intel DPC++)':
        return "Xeon Gold - DPC++/OpenCL"
    elif row['device-name'] == "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz" and row['sycl-implementation'] == 'ComputeCpp':
        return "Xeon Gold - ComputeCpp/OpenCL"
    elif row['device-name'] == "Host Device" and row['sycl-implementation'] == 'ComputeCpp':
        return "Xeon Gold - ComputeCpp/CPU"
    elif row['device-name'] == 'unknown' and row['sycl-implementation'] == 'triSYCL':
        return "Xeon Gold - triSYCL/OpenMP"
    else:
        # Bug fix: the original printed the undefined name `device`
        # (NameError); report the actual unmatched column values instead.
        print("unrecognised runtime:", row['device-name'], row['sycl-implementation'])
        return None
vec['Runtime'] = vec.apply (lambda row: clear_up_runtime(row), axis=1)
Convert these runtimes to factors.
%%R -i vec -o vec
vec$Runtime <- as.factor(vec$Runtime)
Permanently assign colour to each runtime -- to avoid confusion and colour reuse when plots are broken down into types of accelerator.
%%R -i vec -o colour_scale
# While viridis is a great colour palette, we need high contrast between neighouring elements --like the rainbow palette-- but still need to be colour-blind friendly.
#library('viridisLite')
#colours <- viridisLite::viridis(length(unique(all_res$Runtime)))
#library(RColorBrewer)
#colours <- brewer.pal(length(unique(all_res$Runtime)),'Dark2')
library(scales)
colours <- hue_pal()(length(unique(vec$Runtime)))
names(colours) <- levels(vec$Runtime)
colour_scale <- scale_colour_manual(name = "Runtime",values = colours)
vec
%%R -i vec -i colour_scale -w 8.3 -h 11.7 --units in -r 200
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
library('scales')
names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(vec, aes(x=problem.size, y=run.time.sample, colour=Runtime, group = interaction(problem.size, Runtime))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Problem Size", y="Execution Time (s)") + geom_boxplot(position="identity",alpha=0.5,outlier.size = 0.01) + colour_scale + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Problem Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))
print(p1)
And under a log-transform on the y-axis:
%%R -i vec -i colour_scale -w 8.3 -h 11.7 --units in -r 200
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
library('scales')
names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(vec, aes(x=problem.size, y=run.time.sample, colour=Runtime, group = interaction(problem.size, Runtime))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Problem Size", y="Execution Time (s)") + geom_boxplot(position="identity",alpha=0.5, outlier.size = 0.01) + colour_scale + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Problem Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + scale_y_log10("Execution Time", breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))
print(p1)
Repeats (--num-runs) were set to 250 and the problem sizes incrementally increased over the range of 2^8 to 2^30, to examine the longer-term effects, such as viewing how performance tapers off between SYCL backend and implementation.
Generated with the following script:
! ./run_vec_add_big_tests.sh
# Load the big-problem-size (2^8..2^30, 250 repeats) vec_add results for
# every SYCL implementation/backend and outer-merge them into one dataframe.
import pandas
vec = pandas.read_csv('./vec_add_big_tests_ComputeCPP-cpu.csv',comment='#')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_big_tests_dpc++-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_big_tests_hipsycl-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_big_tests_trisycl-cpu.csv',comment='#'),how='outer')
vec = pandas.merge(vec, pandas.read_csv('./vec_add_big_tests_ComputeCPP-opencl.csv',comment='#'),how='outer')
# Sanity check: report how many timing samples each data point carries.
print("using sample size of:",len(list(map(float,vec['run-time-samples'][0].split()))),"elements per data-point")
Restructure dataframe to split up the run-time-samples into separate run-time-sample by duplicating each row with a unique sample -- this is for R to do the heavy-lifting by generating the box-and-whisker plots and summary statistics.
# Explode the space-separated 'run-time-samples' string into one duplicated
# row per individual 'run-time-sample' (same transform as the earlier cells).
from tqdm import tqdm
expanded = []
for index, row in tqdm(vec.iterrows(), total=vec.shape[0]):
    samples = row['run-time-samples']
    # .split() without an argument tolerates repeated/trailing whitespace,
    # which would make float('') crash with the original split(' ').
    for y in samples.split():
        tmprow = row.copy()  # copy: avoid aliasing one Series across rows
        tmprow['run-time-sample'] = float(y)
        expanded.append(tmprow)
# Build the frame once: DataFrame.append in a loop was O(n^2) and the method
# was deprecated and removed in pandas 2.x.
outdat = pandas.DataFrame(expanded)
outdat = outdat.drop(columns=['run-time-samples'])
vec = outdat
vec
We also add the Runtime variable which is named according to the sycl-implementation and device-name. This is needed because all CPU backends cannot query the device name.
def clear_up_runtime (row):
    """Derive a human-readable 'Runtime' label from the 'device-name' and
    'sycl-implementation' columns of a result row.

    Both columns are needed because the CPU backends cannot report a
    distinctive device name. Returns None (after logging the unmatched
    values) for unrecognised combinations.
    """
    if row['device-name'] == "Device 66af" and row['sycl-implementation'] == "hipSYCL":
        return "Vega 20 - hipSYCL/ROCm" # (gfx906)
    elif row['device-name'] == "Tesla P100-PCIE-12GB" and row['sycl-implementation'] == "hipSYCL":
        return "Tesla P100 - hipSYCL/CUDA"
    elif row['device-name'] == "hipCPU OpenMP host device" and row['sycl-implementation'] == "hipSYCL":
        return "Xeon Gold - hipSYCL/OpenMP"
    elif row['device-name'] == "Tesla P100-PCIE-12GB" and row['sycl-implementation'] == "LLVM CUDA (Codeplay)":
        return "Tesla P100 - DPC++/CUDA"
    elif row['device-name'] == "SYCL host device" and row['sycl-implementation'] == 'LLVM (Intel DPC++)':
        return "Xeon Gold - DPC++/CPU"
    #todo: generate and check this one:
    elif row['device-name'] == "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz" and row['sycl-implementation'] == 'LLVM (Intel DPC++)':
        return "Xeon Gold - DPC++/OpenCL"
    elif row['device-name'] == "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz" and row['sycl-implementation'] == 'ComputeCpp':
        return "Xeon Gold - ComputeCpp/OpenCL"
    elif row['device-name'] == "Host Device" and row['sycl-implementation'] == 'ComputeCpp':
        return "Xeon Gold - ComputeCpp/CPU"
    elif row['device-name'] == 'unknown' and row['sycl-implementation'] == 'triSYCL':
        return "Xeon Gold - triSYCL/OpenMP"
    else:
        # Bug fix: the original printed the undefined name `device`
        # (NameError); report the actual unmatched column values instead.
        print("unrecognised runtime:", row['device-name'], row['sycl-implementation'])
        return None
vec['Runtime'] = vec.apply (lambda row: clear_up_runtime(row), axis=1)
Convert these runtimes to factors.
%%R -i vec -o vec
vec$Runtime <- as.factor(vec$Runtime)
Permanently assign colour to each runtime -- to avoid confusion and colour reuse when plots are broken down into types of accelerator.
%%R -i vec -o colour_scale
# While viridis is a great colour palette, we need high contrast between neighouring elements --like the rainbow palette-- but still need to be colour-blind friendly.
#library('viridisLite')
#colours <- viridisLite::viridis(length(unique(all_res$Runtime)))
#library(RColorBrewer)
#colours <- brewer.pal(length(unique(all_res$Runtime)),'Dark2')
library(scales)
colours <- hue_pal()(length(unique(vec$Runtime)))
names(colours) <- levels(vec$Runtime)
colour_scale <- scale_colour_manual(name = "Runtime",values = colours)
vec
# Sanity check: count the ComputeCpp/OpenCL samples recorded for the
# smallest VectorAddition_int32 problem size.
mask = (
    (vec['Runtime'] == "Xeon Gold - ComputeCpp/OpenCL")
    & (vec["problem-size"] == 256)
    & (vec['Benchmark name'] == "VectorAddition_int32")
)
x = vec[mask]
len(x) #why was the OpenCL version running the wrong test?
%%R -i vec -i colour_scale -w 8.3 -h 11.7 --units in -r 200
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
library('scales')
names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(vec, aes(x=problem.size, y=run.time.sample, colour=Runtime, group = interaction(problem.size, Runtime))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Problem Size", y="Execution Time (s)") + geom_boxplot(position="identity",alpha=0.5,outlier.size = 0.01) + colour_scale + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Problem Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))
print(p1)
And under a log-transform on the y-axis:
%%R -i vec -i colour_scale -w 8.3 -h 11.7 --units in -r 200
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
library('scales')
names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(vec, aes(x=problem.size, y=run.time.sample, colour=Runtime, group = interaction(problem.size, Runtime))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Problem Size", y="Execution Time (s)") + geom_boxplot(position="identity",alpha=0.5, outlier.size = 0.01) + colour_scale + facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") + scale_x_log10("Problem Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + scale_y_log10("Execution Time", breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))
print(p1)
Note: rerun these tests when the system is idle.
This outlines the experiment used to see the effect of local-workgroup sizes on different architectures. In particular, is the largest possible even division of tasks across the number of cores the best suited to CPU architectures regardless of backend? This would mitigate the overhead of the backend language's scheduling policy. What about GPU backends -- does the optimal local workgroup size vary between backends despite using exactly the same architecture?
Unfortunately, the Vector Addition benchmark is in the BKP SYCL Computational Construct and thus does not support setting local workgroup sizes. Instead we add a new version of Vector Addition vec_add_wgp with this functionality.
To generate this data on hipSYCL with OpenMP:
#the default/baseline sycl bkp -- where no workgroups are specified
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add --size=1073741824 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
#the serial baseline version -- uses no parallelism
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_serial --size=1073741824 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
#wgp kernels with the OpenMP Number of threads statically set from the command line -- we should know more about the parallelism available in the application than the runtime.
#This set-num-threads should be no greater than the physical cores available in the system.
! OMP_NUM_THREADS=1 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=1073741824 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=2 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=536870912 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=4 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=268435456 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=8 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=134217728 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=16 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=67108864 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=32 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=33554432 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
#wgp kernels without overriding the core utilization in OpenMP.
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=32 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=64 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=128 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=256 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=512 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=1024 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=2048 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=4096 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=8192 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=16384 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=32768 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=65536 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=131072 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=262144 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=524288 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=1048576 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=2097152 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=4194304 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=8388608 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=16777216 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=33554432 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=67108864 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=134217728 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=268435456 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=536870912 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-hipsycl-cpu/vec_add_wgp --size=1073741824 --local=1073741824 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
To generate this data on triSYCL with OpenMP:
#the default/baseline sycl bkp -- where no workgroups are specified
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add --size=1073741824 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
#the serial baseline version -- uses no parallelism
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_serial --size=1073741824 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
#wgp kernels with the OpenMP Number of threads statically set from the command line -- we should know more about the parallelism available in the application than the runtime.
#This set-num-threads should be no greater than the physical cores available in the system.
! OMP_NUM_THREADS=1 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=1073741824 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=2 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=536870912 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=4 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=268435456 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=8 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=134217728 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=16 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=67108864 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! OMP_NUM_THREADS=32 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=33554432 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
#wgp kernels without overriding the core utilization in OpenMP.
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=32 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=64 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=128 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=256 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=512 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=1024 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=2048 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=4096 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=8192 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=16384 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=32768 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=65536 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=131072 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=262144 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=524288 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=1048576 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=2097152 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=4194304 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=8388608 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=16777216 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=33554432 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=67108864 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=134217728 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=268435456 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=536870912 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib ./new-build-trisycl-cpu/vec_add_wgp --size=1073741824 --local=1073741824 --num-runs=100 --no-verification --output="./local_workgroup_overheads_with_sycl_implementations.csv"
import pandas
vec = pandas.read_csv('./local_workgroup_overheads_with_sycl_implementations.csv', comment='#')
# Rearrange and clean the data: explode the space-separated 'run-time-samples'
# string into one row per individual 'run-time-sample', so R can compute
# box-and-whisker statistics directly.
from tqdm import tqdm
rows = []
for index, row in tqdm(vec.iterrows(), total=vec.shape[0]):
    # split() with no argument tolerates repeated/trailing whitespace;
    # split(' ') would yield '' tokens and float('') raises ValueError.
    for y in row['run-time-samples'].split():
        tmprow = row.copy()  # copy: without it every appended row aliases the same Series
        tmprow['run-time-sample'] = float(y)
        rows.append(tmprow)
# Build the frame in one shot: DataFrame.append in a loop was O(n^2) and is
# removed in pandas 2.0.
outdat = pandas.DataFrame(rows).drop(columns=['run-time-samples'])
vec = outdat
#import pandas
#vec = pandas.read_csv('./split_test.csv',comment='#')
#vec['local-size'].unique()
def clear_up_runtime(row):
    # CPU backends cannot query the device name, so the runtime label is
    # simply the SYCL implementation name.
    return str(row['sycl-implementation'])
vec['runtime'] = vec.apply(clear_up_runtime, axis=1)
vec = vec.drop(columns=['Verification'])
vec
%load_ext rpy2.ipython
%%R -i vec -w 8.3 -h 11.7 --units in -r 200
# Execution time vs local workgroup size (log-log), one facet per benchmark.
# Dashed blue line: median of the serial hipSYCL baseline samples.
outlier_size = 0.01  # fix: was set to 0.10 but unused -- the plot hard-coded 0.01; keep the rendered size and route it through the variable
library('ggplot2')
library('latex2exp')
library('scales')
# R-safe column names, e.g. 'run-time-sample' -> 'run.time.sample'
names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(vec, aes(x=local.size, y=run.time.sample, colour=sycl.implementation,
                      group = interaction(local.size, sycl.implementation))) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) +
  expand_limits(y = 0) +
  labs(colour = "SYCL Runtime", x="Local Size", y="Execution Time (s)") +
  geom_boxplot(position="dodge2", alpha=0.5, outlier.size = outlier_size) +
  facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") +
  scale_x_log10("Local Workgroup Size",
                breaks = trans_breaks("log2", function(x) 2^x),
                labels = trans_format("log2", math_format(2^.x))) +
  scale_y_log10("Execution Time") +
  geom_hline(yintercept=median(subset(vec, sycl.implementation == "hipSYCL-serial")$run.time.sample),
             linetype="dashed", color = "blue")
ggsave('hipsycl-openmp-workgroup-size-performance.pdf', p1, device="pdf", width=8.3, height=11.7, units="in", dpi=320)
print(p1)
%%R -i vec -w 8.3 -h 11.7 --units in -r 200
# Same data plotted against the number of workgroups
# (problem size / local size); dashed blue line: serial hipSYCL median.
outlier_size = 0.01  # fix: was set to 0.10 but unused -- the plot hard-coded 0.01
library('ggplot2')
library('latex2exp')
library('scales')
names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)
vec$num.workgroups = as.integer(vec$problem.size / vec$local.size)
p1 <- ggplot(vec, aes(x=num.workgroups, y=run.time.sample, colour=sycl.implementation,
                      group = interaction(local.size, sycl.implementation))) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) +
  expand_limits(y = 0) +
  labs(colour = "VectorAddition Implementation", x="Number of Workgroups", y="Execution Time (s)") +
  geom_boxplot(position="dodge2", alpha=0.5, outlier.size = outlier_size) +
  facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") +
  scale_x_log10("Number Of Workgroups",
                breaks = trans_breaks("log2", function(x) 2^x),
                labels = trans_format("log2", math_format(2^.x))) +
  scale_y_log10("Execution Time") +
  geom_hline(yintercept=median(subset(vec, sycl.implementation == "hipSYCL-serial")$run.time.sample),
             linetype="dashed", color = "blue")
ggsave('hipsycl-openmp-and-using-workgroups.pdf', p1, device="pdf", width=8.3, height=11.7, units="in", dpi=320)
print(p1)
import pandas
vec = pandas.read_csv('./trisycl.csv', comment='#')
# Rearrange and clean the data: explode the space-separated 'run-time-samples'
# string into one row per individual 'run-time-sample'.
from tqdm import tqdm
rows = []
for index, row in tqdm(vec.iterrows(), total=vec.shape[0]):
    # split() tolerates repeated/trailing whitespace (split(' ') would produce
    # '' tokens, and float('') raises).
    for y in row['run-time-samples'].split():
        tmprow = row.copy()  # copy: avoid aliasing one Series across all appended rows
        tmprow['run-time-sample'] = float(y)
        rows.append(tmprow)
# Single DataFrame construction instead of the O(n^2), pandas-2.0-removed
# DataFrame.append-in-a-loop pattern.
outdat = pandas.DataFrame(rows).drop(columns=['run-time-samples'])
vec = outdat
def clear_up_runtime(row):
    # CPU backends cannot query the device name; label by SYCL implementation.
    return str(row['sycl-implementation'])
vec['runtime'] = vec.apply(clear_up_runtime, axis=1)
vec = vec.drop(columns=['Verification'])
%%R -i vec -w 8.3 -h 11.7 --units in -r 200
# triSYCL: execution time vs local workgroup size (p1) and vs number of
# workgroups (p2). Dashed blue line: median of the serial triSYCL samples.
outlier_size = 0.01  # fix: was set to 0.10 but unused -- both plots hard-coded 0.01
library('ggplot2')
library('latex2exp')
library('scales')
names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(vec, aes(x=local.size, y=run.time.sample, colour=sycl.implementation,
                      group = interaction(local.size, sycl.implementation))) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) +
  expand_limits(y = 0) +
  labs(colour = "SYCL Runtime", x="Local Size", y="Execution Time (s)") +
  geom_boxplot(position="dodge2", alpha=0.5, outlier.size = outlier_size) +
  facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") +
  scale_x_log10("Local Workgroup Size",
                breaks = trans_breaks("log2", function(x) 2^x),
                labels = trans_format("log2", math_format(2^.x))) +
  scale_y_log10("Execution Time") +
  geom_hline(yintercept=median(subset(vec, sycl.implementation == "triSYCL-serial")$run.time.sample),
             linetype="dashed", color = "blue")
ggsave('trisycl-openmp-workgroup-size-performance.pdf', p1, device="pdf", width=8.3, height=11.7, units="in", dpi=320)
print(p1)
vec$num.workgroups = as.integer(vec$problem.size / vec$local.size)
p2 <- ggplot(vec, aes(x=num.workgroups, y=run.time.sample, colour=sycl.implementation,
                      group = interaction(local.size, sycl.implementation))) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) +
  expand_limits(y = 0) +
  labs(colour = "VectorAddition Implementation", x="Number of Workgroups", y="Execution Time (s)") +
  geom_boxplot(position="dodge2", alpha=0.5, outlier.size = outlier_size) +
  facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") +
  scale_x_log10("Number Of Workgroups",
                breaks = trans_breaks("log2", function(x) 2^x),
                labels = trans_format("log2", math_format(2^.x))) +
  scale_y_log10("Execution Time") +
  geom_hline(yintercept=median(subset(vec, sycl.implementation == "triSYCL-serial")$run.time.sample),
             linetype="dashed", color = "blue")
ggsave('trisycl-openmp-and-using-workgroups.pdf', p2, device="pdf", width=8.3, height=11.7, units="in", dpi=320)
print(p2)
import pandas
vec = pandas.read_csv('./vec_add_wgp_hipsycl-cpu.csv', comment='#')
# Rearrange and clean the data: explode the space-separated 'run-time-samples'
# string into one row per individual 'run-time-sample'.
from tqdm import tqdm
rows = []
for index, row in tqdm(vec.iterrows(), total=vec.shape[0]):
    # split() tolerates repeated/trailing whitespace (split(' ') would produce
    # '' tokens, and float('') raises).
    for y in row['run-time-samples'].split():
        tmprow = row.copy()  # copy: avoid aliasing one Series across all appended rows
        tmprow['run-time-sample'] = float(y)
        rows.append(tmprow)
# Single DataFrame construction instead of the O(n^2), pandas-2.0-removed
# DataFrame.append-in-a-loop pattern.
outdat = pandas.DataFrame(rows).drop(columns=['run-time-samples'])
vec = outdat
def clear_up_runtime(row):
    # CPU backends cannot query the device name; label by SYCL implementation.
    return str(row['sycl-implementation'])
vec['runtime'] = vec.apply(clear_up_runtime, axis=1)
vec = vec.drop(columns=['Verification'])
%%R -i vec -w 8.3 -h 11.7 --units in -r 200
# hipSYCL vec_add_wgp: execution time vs local workgroup size (p1) and vs
# number of workgroups (p2).
# NOTE(review): this cell loads hipSYCL CPU data, yet the baseline subsets
# select "triSYCL-serial" (likely empty -> NA hline) and the ggsave filenames
# reuse the trisycl cell's names, overwriting its PDFs -- confirm intent.
outlier_size = 0.01  # fix: was set to 0.10 but unused -- both plots hard-coded 0.01
library('ggplot2')
library('latex2exp')
library('scales')
names(vec) <- make.names(names(vec), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(vec, aes(x=local.size, y=run.time.sample, colour=sycl.implementation,
                      group = interaction(local.size, sycl.implementation))) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) +
  expand_limits(y = 0) +
  labs(colour = "SYCL Runtime", x="Local Size", y="Execution Time (s)") +
  geom_boxplot(position="dodge2", alpha=0.5, outlier.size = outlier_size) +
  facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") +
  scale_x_log10("Local Workgroup Size",
                breaks = trans_breaks("log2", function(x) 2^x),
                labels = trans_format("log2", math_format(2^.x))) +
  scale_y_log10("Execution Time") +
  geom_hline(yintercept=median(subset(vec, sycl.implementation == "triSYCL-serial")$run.time.sample),
             linetype="dashed", color = "blue")
ggsave('trisycl-openmp-workgroup-size-performance.pdf', p1, device="pdf", width=8.3, height=11.7, units="in", dpi=320)
print(p1)
vec$num.workgroups = as.integer(vec$problem.size / vec$local.size)
p2 <- ggplot(vec, aes(x=num.workgroups, y=run.time.sample, colour=sycl.implementation,
                      group = interaction(local.size, sycl.implementation))) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) +
  expand_limits(y = 0) +
  labs(colour = "VectorAddition Implementation", x="Number of Workgroups", y="Execution Time (s)") +
  geom_boxplot(position="dodge2", alpha=0.5, outlier.size = outlier_size) +
  facet_wrap( ~ Benchmark.name, strip.position = "top", scales = "free_x") +
  scale_x_log10("Number Of Workgroups",
                breaks = trans_breaks("log2", function(x) 2^x),
                labels = trans_format("log2", math_format(2^.x))) +
  scale_y_log10("Execution Time") +
  geom_hline(yintercept=median(subset(vec, sycl.implementation == "triSYCL-serial")$run.time.sample),
             linetype="dashed", color = "blue")
ggsave('trisycl-openmp-and-using-workgroups.pdf', p2, device="pdf", width=8.3, height=11.7, units="in", dpi=320)
print(p2)
Memory access patterns on the CPU can be characterised by cache reference and cache miss rates. We measure them with perf, provided by the linux-tools-generic package.
# Count cache references and misses for the 1024x1024 matmul over 100 runs.
! LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hipSYCL/llvm/lib perf stat -B -e cache-references,cache-misses ./matmul_bkp --size=1024 --num-runs=100
import pandas
#matmul = pandas.read_csv('./results/sycl-matmul-implementations-cache-miss-and-execution-times-of-cpu-based-sycl-on-xeon-gold-6134.csv', comment='#')
matmul = pandas.read_csv('./results/sycl-matmul-implementations-cache-miss-and-execution-times-sycl-on-xeon-gold-6134-and-p100.csv', comment='#')
# '-' marks rows recorded without a local size; treat them as local size 1
matmul.loc[matmul['Local Size'] == '-', 'Local Size'] = 1
matmul["Local Size"] = pandas.to_numeric(matmul["Local Size"])
# remove entries without runtimes -- corresponds to tests that won't run or take too long to measure
matmul = matmul.dropna()
# Explode the space-separated 'Runtimes' string into one 'Runtime' per row.
from tqdm import tqdm
rows = []
for index, row in tqdm(matmul.iterrows(), total=matmul.shape[0]):
    samples = row['Runtimes']
    try:
        # split() tolerates repeated/trailing whitespace; split(' ') would
        # yield '' tokens and float('') raises.
        for y in samples.split():
            tmprow = row.copy()  # copy: avoid aliasing one Series across appended rows
            tmprow['Runtime'] = float(y)
            rows.append(tmprow)
    except (AttributeError, ValueError):
        # Non-string cell or non-numeric token: report and skip, as before.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt etc.)
        print(samples)
# Single DataFrame construction instead of deprecated append-in-a-loop.
outdat = pandas.DataFrame(rows).drop(columns=['Runtimes'])
matmul = outdat
matmul
df = matmul
# Tag each implementation label with the device it ran on, in a single pass.
# A dict-valued replace() performs all full-value substitutions at once.
df = df.replace({
    'ComputeCPP OpenCL-Gold': 'ComputeCPP OpenCL - Gold',
    'ComputeCPP pthreads': 'ComputeCPP pthreads - Gold',
    'DPC++ CUDA': 'DPC++ CUDA - P100',
    'DPC++ pthreads': 'DPC++ pthreads - Gold',
    'hipSYCL CUDA': 'hipSYCL CUDA - P100',
    'hipSYCL OpenMP': 'hipSYCL OpenMP - Gold',
    'hipSYCL ROCm': 'hipSYCL ROCm - gfx906',
    'triSYCL OpenMP': 'triSYCL OpenMP - Gold',
})
matmul = df
%%R -i matmul -w 11.7 -h 8.3 --units in -r 200
# Matmul execution time vs local workgroup size, faceted by
# implementation/backend; dashed line: median runtime of the Serial version.
outlier_size = 0.01  # fix: was set to 0.10 but unused -- the plot hard-coded 0.01
library('ggplot2')
library('latex2exp')
library('scales')
#library('ggpattern')
names(matmul) <- make.names(names(matmul), unique = FALSE, allow_ = TRUE)
#print(matmul$Implementation.and.Backend)
# Order facets: CPU backends first, then OpenCL/GPU backends
matmul$Implementation.and.Backend <- factor(matmul$Implementation.and.Backend, levels = c("ComputeCPP pthreads - Gold", "DPC++ pthreads - Gold", "hipSYCL OpenMP - Gold", "triSYCL OpenMP - Gold", "ComputeCPP OpenCL - Gold", "DPC++ CUDA - P100","hipSYCL CUDA - P100", "hipSYCL ROCm - gfx906"))
#print(matmul$Version <- as.factor(matmul$Version))
#matmul$Local.Size=as.numeric(levels(matmul$Local.Size))[matmul$Local.Size]
#colour=Implementation.and.Backend, fill=Version, group = interaction(Local.Size, Implementation.and.Backend,Version)
# Abandoned ggpattern experiment kept for reference:
#p1 <- ggplot(matmul, aes(x=Local.Size, y=Runtime,group=interaction(Local.Size,Implementation.and.Backend,Version))) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Local Size", y="Execution Time (s)") + ggpattern::geom_boxplot_pattern(aes(pattern=Version,fill=Version,color=Version),pattern_spacing=0.02, position="dodge2",alpha=0.5,outlier.size = 0.01) + scale_x_log10("Local Workgroup Size", breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + scale_y_log10("Log(Execution Time[s])") + expand_limits(y = 0) + geom_hline(yintercept=median(subset(matmul, version == "Serial")$Runtime), linetype="dashed", color = "blue")
p1 <- ggplot(matmul, aes(x=Local.Size, y=Runtime, colour=Version,
                         group=interaction(Local.Size, Version))) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.position="bottom") +
  expand_limits(y = 0) +
  labs(colour = "SYCL Parallel Construct", x="Local Size", y="Execution Time (s)") +
  geom_boxplot(position="dodge2", alpha=0.5, outlier.size = outlier_size) +
  scale_x_log10("Local Workgroup Size",
                breaks = trans_breaks("log2", function(x) 2^x),
                labels = trans_format("log2", math_format(2^.x))) +
  scale_y_log10(TeX("Execution Time $\\Log_{10}$(s)")) +
  expand_limits(y = 0, x = 1024) +
  facet_wrap( ~ Implementation.and.Backend, strip.position = "top", scales = "free_x", nrow=2) +
  geom_hline(yintercept=median(subset(matmul, Version == "Serial")$Runtime), linetype="dashed", color = "turquoise3")
#print(median(subset(matmul, Version == "Serial")$Runtime))
ggsave('matmul-local-performance.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
print(p1)
Alternatively, we can plot against the number of workgroups, which is inversely proportional to the local workgroup size.
%%R -i matmul -w 11.7 -h 8.3 --units in -r 200
# Matmul execution time vs number of workgroups (1024 / local size),
# faceted by implementation/backend.
outlier_size = 0.01  # fix: was set to 0.10 but unused -- the plot hard-coded 0.01
library('ggplot2')
library('latex2exp')
library('scales')
library('ggpattern')
names(matmul) <- make.names(names(matmul), unique = FALSE, allow_ = TRUE)
matmul$Implementation.and.Backend <- factor(matmul$Implementation.and.Backend, levels = c("ComputeCPP pthreads - Gold", "DPC++ pthreads - Gold", "hipSYCL OpenMP - Gold", "triSYCL OpenMP - Gold", "ComputeCPP OpenCL - Gold", "DPC++ CUDA - P100","hipSYCL CUDA - P100", "hipSYCL ROCm - gfx906"))
# 1024 -- presumably the matmul problem size used for these runs; confirm
matmul$Number.of.Workgroups = as.integer(1024/matmul$Local.Size)
# NOTE(review): labs(x="Local Size") is overridden by the scale_x_log10 title
# below; the saved PDF is portrait (8.3x11.7) while the cell renders landscape
# (11.7x8.3) -- confirm which orientation is intended.
p2 <- ggplot(matmul, aes(x=Number.of.Workgroups, y=Runtime, colour=Version,
                         group=interaction(Local.Size, Version))) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) +
  expand_limits(y = 0) +
  labs(colour = "SYCL Execution Construct", x="Local Size", y="Execution Time (s)") +
  geom_boxplot(position="dodge2", alpha=0.5, outlier.size = outlier_size) +
  scale_x_log10("Number of Local Workgroups",
                breaks = trans_breaks("log2", function(x) 2^x),
                labels = trans_format("log2", math_format(2^.x))) +
  scale_y_log10(TeX("Execution Time $\\Log_{10}$(s)")) +
  expand_limits(y = 0) +
  facet_wrap( ~ Implementation.and.Backend, strip.position = "top", scales = "free_x", nrow=2)
ggsave('matmul-number-of-workgroups-performance.pdf', p2, device="pdf",width=8.3, height=11.7, units="in",dpi=320)
print(p2)
Percentage of Missed Cache References:
%%R -i matmul -w 11.7 -h 8.3 --units in -r 200
# Cache-miss percentage vs local workgroup size, faceted by
# implementation/backend; dashed line: Serial version's median miss rate.
outlier_size = 0.01  # fix: was set to 0.10 but unused -- the plot hard-coded 0.01
library('ggplot2')
library('latex2exp')
library('scales')
library('ggpattern')
names(matmul) <- make.names(names(matmul), unique = FALSE, allow_ = TRUE)
matmul$Implementation.and.Backend <- factor(matmul$Implementation.and.Backend, levels = c("ComputeCPP pthreads - Gold", "DPC++ pthreads - Gold", "hipSYCL OpenMP - Gold", "triSYCL OpenMP - Gold", "ComputeCPP OpenCL - Gold", "DPC++ CUDA - P100","hipSYCL CUDA - P100", "hipSYCL ROCm - gfx906"))
p1 <- ggplot(matmul, aes(x=Local.Size, y=Missed...of.all.Cache.References, colour=Version,
                         group=interaction(Local.Size, Version))) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.position="bottom") +
  expand_limits(y = 0) +
  labs(colour = "SYCL Parallelism Construct", x="Local Size", y="Missed (%) of Cache References") +
  geom_boxplot(position="dodge2", alpha=0.5, outlier.size = outlier_size) +
  scale_x_log10("Local Workgroup Size",
                breaks = trans_breaks("log2", function(x) 2^x),
                labels = trans_format("log2", math_format(2^.x))) +
  expand_limits(y = 0) +
  facet_wrap( ~ Implementation.and.Backend, strip.position = "top", scales = "free_x", nrow=2) +
  # fix: was subset(matmul, version == "Serial")$Runtime. R is case-sensitive,
  # so lowercase 'version' resolved to base R's `version` object (empty subset,
  # NA intercept), and the Runtime median (seconds) is the wrong metric for a
  # percentage axis -- baseline on the plotted metric instead.
  geom_hline(yintercept=median(subset(matmul, Version == "Serial")$Missed...of.all.Cache.References),
             linetype="dashed", color = "blue")
ggsave('matmul-local-cache-miss.pdf', p1, device="pdf",width=8.3, height=11.7, units="in",dpi=320)
print(p1)
Absolute Number of Cache References -- for 100 repeats/samples:
%%R -i matmul -w 11.7 -h 8.3 --units in -r 200
# Absolute cache-reference counts vs local workgroup size; dashed line:
# Serial version's median reference count.
outlier_size = 0.01  # fix: was set to 0.10 but unused -- the plot hard-coded 0.01
library('ggplot2')
library('latex2exp')
library('scales')
library('ggpattern')
names(matmul) <- make.names(names(matmul), unique = FALSE, allow_ = TRUE)
matmul$Implementation.and.Backend <- factor(matmul$Implementation.and.Backend, levels = c("ComputeCPP pthreads - Gold", "DPC++ pthreads - Gold", "hipSYCL OpenMP - Gold", "triSYCL OpenMP - Gold", "ComputeCPP OpenCL - Gold", "DPC++ CUDA - P100","hipSYCL CUDA - P100", "hipSYCL ROCm - gfx906"))
p1 <- ggplot(matmul, aes(x=Local.Size, y=Cache.Reference.Count, colour=Version,
                         group=interaction(Local.Size, Version))) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) +
  expand_limits(y = 0) +
  labs(colour = "SYCL Execution Construct", x="Local Size", y="Number of Cache References") +
  geom_boxplot(position="dodge2", alpha=0.5, outlier.size = outlier_size) +
  scale_x_log10("Local Workgroup Size",
                breaks = trans_breaks("log2", function(x) 2^x),
                labels = trans_format("log2", math_format(2^.x))) +
  expand_limits(y = 0) +
  facet_wrap( ~ Implementation.and.Backend, strip.position = "top", scales = "free_x", nrow=2) +
  # fix: was subset(matmul, version == "Serial")$Runtime -- lowercase 'version'
  # hit base R's `version` object (empty subset, NA intercept), and Runtime is
  # the wrong metric for a count axis.
  geom_hline(yintercept=median(subset(matmul, Version == "Serial")$Cache.Reference.Count),
             linetype="dashed", color = "blue")
ggsave('matmul-local-total-cache-references.pdf', p1, device="pdf",width=8.3, height=11.7, units="in",dpi=320)
print(p1)
Absolute Number of Cache Misses -- for 100 repeats/samples:
%%R -i matmul -w 11.7 -h 8.3 --units in -r 200
# Absolute cache-miss counts vs local workgroup size; dashed line:
# Serial version's median miss count.
outlier_size = 0.01  # fix: was set to 0.10 but unused -- the plot hard-coded 0.01
library('ggplot2')
library('latex2exp')
library('scales')
library('ggpattern')
names(matmul) <- make.names(names(matmul), unique = FALSE, allow_ = TRUE)
matmul$Implementation.and.Backend <- factor(matmul$Implementation.and.Backend, levels = c("ComputeCPP pthreads - Gold", "DPC++ pthreads - Gold", "hipSYCL OpenMP - Gold", "triSYCL OpenMP - Gold", "ComputeCPP OpenCL - Gold", "DPC++ CUDA - P100","hipSYCL CUDA - P100", "hipSYCL ROCm - gfx906"))
p1 <- ggplot(matmul, aes(x=Local.Size, y=Cache.Miss.Count, colour=Version,
                         group=interaction(Local.Size, Version))) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) +
  expand_limits(y = 0) +
  labs(colour = "SYCL Execution Construct", x="Local Size", y="Number of Cache Misses") +
  geom_boxplot(position="dodge2", alpha=0.5, outlier.size = outlier_size) +
  scale_x_log10("Local Workgroup Size",
                breaks = trans_breaks("log2", function(x) 2^x),
                labels = trans_format("log2", math_format(2^.x))) +
  expand_limits(y = 0) +
  facet_wrap( ~ Implementation.and.Backend, strip.position = "top", scales = "free_x", nrow=2) +
  # fix: was subset(matmul, version == "Serial")$Runtime -- lowercase 'version'
  # hit base R's `version` object (empty subset, NA intercept), and Runtime is
  # the wrong metric for a count axis.
  geom_hline(yintercept=median(subset(matmul, Version == "Serial")$Cache.Miss.Count),
             linetype="dashed", color = "blue")
ggsave('matmul-local-total-cache-miss.pdf', p1, device="pdf",width=8.3, height=11.7, units="in",dpi=320)
print(p1)
Percentage of Missed Cache References for just the CPU:
%%R -i matmul -w 11.7 -h 4.15 --units in -r 200
# CPU-only cache-miss percentage vs local workgroup size (GPU backends
# dropped); dashed line: Serial version's median miss rate.
outlier_size = 0.01  # fix: was set to 0.10 but unused -- the plot hard-coded 0.01
library('ggplot2')
library('latex2exp')
library('scales')
#library('ggpattern')
names(matmul) <- make.names(names(matmul), unique = FALSE, allow_ = TRUE)
matmul$Implementation.and.Backend <- factor(matmul$Implementation.and.Backend, levels = c("ComputeCPP pthreads - Gold", "DPC++ pthreads - Gold", "hipSYCL OpenMP - Gold", "triSYCL OpenMP - Gold", "ComputeCPP OpenCL - Gold", "DPC++ CUDA - P100","hipSYCL CUDA - P100", "hipSYCL ROCm - gfx906"))
matmul <- subset(matmul, Implementation.and.Backend!="DPC++ CUDA - P100" & Implementation.and.Backend!="hipSYCL CUDA - P100" & Implementation.and.Backend!="hipSYCL ROCm - gfx906")
#having "Gold" in the names of the backend is redundant -- this is only showing the CPU device for cache misses!
# fix: sub() on a factor returns a character vector, which silently discarded
# the ordered levels set above (facet_wrap fell back to alphabetical order);
# rebuild the factor so the intended facet order is preserved.
matmul$Implementation.and.Backend <- factor(sub(" - Gold$", "", as.character(matmul$Implementation.and.Backend)),
                                            levels = c("ComputeCPP pthreads", "DPC++ pthreads", "hipSYCL OpenMP", "triSYCL OpenMP", "ComputeCPP OpenCL"))
p1 <- ggplot(matmul, aes(x=Local.Size, y=Missed...of.all.Cache.References, colour=Version,
                         group=interaction(Local.Size, Version))) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.position="bottom") +
  expand_limits(y = 0) +
  labs(colour = "SYCL Parallel Construct", x="Local Size", y="Missed (%) of Cache References") +
  geom_boxplot(position="dodge2", alpha=0.5, outlier.size = outlier_size) +
  scale_x_log10("Local Workgroup Size",
                breaks = trans_breaks("log2", function(x) 2^x),
                labels = trans_format("log2", math_format(2^.x))) +
  expand_limits(y = 0, x=1024) +
  facet_wrap( ~ Implementation.and.Backend, strip.position = "top", scales = "free_x", nrow=1) +
  # fix: was subset(matmul, version == "Serial")$Runtime -- lowercase 'version'
  # hit base R's `version` object (empty subset, NA intercept), and Runtime is
  # the wrong metric for a percentage axis.
  geom_hline(yintercept=median(subset(matmul, Version == "Serial")$Missed...of.all.Cache.References),
             linetype="dashed", color = "blue")
ggsave('matmul-cpu-local-cache-miss.pdf', p1, device="pdf",width=11.7, height=4.15, units="in",dpi=320)
print(p1)
# Strip the parallelism-construct suffix from each frame's benchmark names so
# the frames can be compared by base benchmark. regex=False: these are literal
# suffixes, and pandas' historical default regex=True is deprecated/warned.
wgp_fp['Benchmark name'] = wgp_fp['Benchmark name'].str.replace("_NDRange", "", regex=False)
# fix: replace the longer suffix first. Previously "_Hierarchical" ran first,
# turning "..._HierarchicalParallelFor" into "...ParallelFor", so the
# "_HierarchicalParallelFor" replacement could never match.
hdp['Benchmark name'] = hdp['Benchmark name'].str.replace("_HierarchicalParallelFor", "", regex=False)
hdp['Benchmark name'] = hdp['Benchmark name'].str.replace("_Hierarchical", "", regex=False)
task['Benchmark name'] = task['Benchmark name'].str.replace("_SingleTask", "", regex=False)
sync['Benchmark name'] = sync['Benchmark name'].str.replace("_NDRange", "", regex=False)
%%R -i bkp_fp32 -i wgp_fp -i hdp -i task -i sync -i colour_scale -h 11.7 -w 8.3 --units in -r 200
# Assemble the five SYCL-Bench category plots (basic kernels fp32, NDRange,
# hierarchical, single-task, synchronisation) into one labelled 2-column grid
# with a shared legend, and save it as sycl-bench-res.pdf.
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
# a) basic kernels, fp32: execution time per benchmark, log y
names(bkp_fp32) <- make.names(names(bkp_fp32), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(bkp_fp32, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark Name", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
#ggsave('bkp-float.pdf', p1, device="pdf",width=11.7, height=8.3, units="in",dpi=320)
# b) NDRange workgroup benchmarks, restricted to 32-bit data types
names(wgp_fp) <- make.names(names(wgp_fp), unique = FALSE, allow_ = TRUE)
# order factor levels numerically (8, 16, 32, ...) rather than lexically
wgp_fp$data.type.width <- reorder(wgp_fp$data.type.width, as.numeric(wgp_fp$data.type.width))
wgp_fp$Benchmark.name <- gsub("_NDRange", "", wgp_fp$Benchmark.name)
wgp_fp <- subset(wgp_fp, data.type.width=="32")
# relabel widths as "fp8"/"fp16"/... for display
levels(wgp_fp$data.type.width) <- paste("fp",levels(wgp_fp$data.type.width),sep='')
p2 <- ggplot(wgp_fp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
# c) hierarchical-parallelism benchmarks (x order reversed for layout)
names(hdp) <- make.names(names(hdp), unique = FALSE, allow_ = TRUE)
p3 <- ggplot(hdp, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale + scale_x_discrete(limits = rev(unique(hdp$Benchmark.name)))
# d) single-task benchmarks
names(task) <- make.names(names(task), unique = FALSE, allow_ = TRUE)
p4 <- ggplot(task, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="SYCL-Bench Application", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
# e) synchronisation benchmarks
names(sync) <- make.names(names(sync), unique = FALSE, allow_ = TRUE)
p5 <- ggplot(sync, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="SYCL-Bench Application", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
library('cowplot')
# 2-column grid; legends stripped here and re-added once below
pg <- plot_grid(p1 + theme(legend.position="none", axis.title.x=element_blank()),
p2 + theme(legend.position="none", axis.title.x=element_blank()),
p3 + theme(legend.position="none", axis.title.x=element_blank()),
p4 + theme(legend.position="none", axis.title.x=element_blank()),
p5 + theme(legend.position="none", axis.title.x=element_blank()),
align = 'vh', hjust = -2, ncol = 2)
# a)-e) sub-figure labels; x/y are hand-tuned normalised grid coordinates
pg <- pg + draw_plot_label(label=c("a)", "b)", "c)", "d)","e)"),
x=c(0.28,0.8,0.28,0.8,0.28),
y=c(0.69,0.69,0.36,0.36,0.025),
hjust=.5, vjust=.5, size=12)
#bottom legend
# single shared legend extracted from p1, placed in the empty grid slot
legend <- get_legend(p1 + guides(color = guide_legend(ncol = 1,title="SYCL Runtime")) + theme(legend.position = "right"))
#bp <- plot_grid(pg, legend, rel_heights = c(1, .1),nrow=2)
# NOTE(review): draw_grob offsets (0.75, -0.3) are hand-tuned for this page
# size; re-tune if the figure dimensions change.
pg <- pg + draw_grob(legend, 0.75, -0.3, 0, 1)
print(pg)
ggsave('sycl-bench-res.pdf', pg, device="pdf",height=11.7, width=8.3, units="in",dpi=320)
%%R -i bkp_bandw -i bkp_block512 -i colour_scale -w 11.7 -h 4.15 --units in -r 200
# Two-panel microbenchmark figure: a) blocksize sweep, b) bandwidth
# benchmarks; shared legend on the right, saved as microbench.pdf.
outlier_size = 0.10
library('ggplot2')
library('latex2exp')
# a) execution time vs blocksize, log y
names(bkp_block512) <- make.names(names(bkp_block512), unique = FALSE, allow_ = TRUE)
# order blocksize levels numerically rather than lexically
bkp_block512$blocksize <- reorder(bkp_block512$blocksize, as.numeric(bkp_block512$blocksize))
p0 <- ggplot(bkp_block512, aes(x=blocksize, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Blocksize", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
# b) execution time per bandwidth benchmark, log y
names(bkp_bandw) <- make.names(names(bkp_bandw), unique = FALSE, allow_ = TRUE)
p1 <- ggplot(bkp_bandw, aes(x=Benchmark.name, y=run.time.sample, colour=Runtime)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + expand_limits(y = 0) + labs(colour = "SYCL Runtime", x="Benchmark", y=TeX("Execution Time $\\Log_{10}$(s)")) + scale_y_continuous(trans='log10') + geom_boxplot(outlier.size = outlier_size) + colour_scale
library('cowplot')
# side-by-side grid with per-panel legends removed
pg <- plot_grid(p0 + theme(legend.position="none"),
p1 + theme(legend.position="none",plot.margin=unit(c(0,0,0.5,0),"cm")),
align = 'vh', hjust = -3, ncol = 2)
# a)/b) sub-figure labels at hand-tuned normalised coordinates
pg <-pg + draw_plot_label(label=c("a)", "b)"),
x=c(0.28,0.8),
y=c(0.025,0.025),
hjust=.5, vjust=.5, size=12)
#bottom legend
# shared legend extracted from p1, appended as a third narrow column
legend <- get_legend(p1 + guides(color = guide_legend(ncol = 1,title="SYCL Runtime")) + theme(legend.position = "right"))
bg <- plot_grid(pg, legend, rel_widths = c(0.75, .2),ncol=2)
#pg <- pg + draw_grob(legend, 0.75, -0.3, 0, 1)
#print(pg)
print(bg)
ggsave('microbench.pdf', bg, device="pdf",width=11.7, height=4.15, units="in",dpi=320)